{"id":313,"date":"2014-09-11T17:25:29","date_gmt":"2014-09-11T15:25:29","guid":{"rendered":"http:\/\/blog.defrent.de\/?p=313"},"modified":"2019-03-21T16:35:04","modified_gmt":"2019-03-21T15:35:04","slug":"file-size-observations-on-the-iate-tbx-termbase","status":"publish","type":"post","link":"https:\/\/www.defrent.de\/fr\/2014\/09\/file-size-observations-on-the-iate-tbx-termbase\/","title":{"rendered":"File size observations on the IATE TBX Termbase"},"content":{"rendered":"<div class=\"shariff shariff-align-left shariff-widget-align-left\" style=\"display:none\"><ul class=\"shariff-buttons theme-round orientation-horizontal buttonsize-small\"><li class=\"shariff-button mastodon\" style=\"background-color:#ccc\"><a href=\"https:\/\/s2f.kytta.dev\/?text=File%20size%20observations%20on%20the%20IATE%20TBX%20Termbase https%3A%2F%2Fwww.defrent.de%2Ffr%2F2014%2F09%2Ffile-size-observations-on-the-iate-tbx-termbase%2F via @defrentck@hessen.social\" title=\"Envoyer par Mastodon\" aria-label=\"Envoyer par Mastodon\" role=\"button\" rel=\"noopener nofollow\" class=\"shariff-link\" style=\"; background-color:#79b428; color:#fff\" target=\"_blank\"><span class=\"shariff-icon\" style=\"\"><svg width=\"75\" height=\"79\" viewBox=\"0 0 75 79\" fill=\"none\" xmlns=\"http:\/\/www.w3.org\/2000\/svg\"><path d=\"M37.813-.025C32.462-.058 27.114.13 21.79.598c-8.544.621-17.214 5.58-20.203 13.931C-1.12 23.318.408 32.622.465 41.65c.375 7.316.943 14.78 3.392 21.73 4.365 9.465 14.781 14.537 24.782 15.385 7.64.698 15.761-.213 22.517-4.026a54.1 54.1 0 0 0 .01-6.232c-6.855 1.316-14.101 2.609-21.049 1.074-3.883-.88-6.876-4.237-7.25-8.215-1.53-3.988 3.78-.43 5.584-.883 9.048 1.224 18.282.776 27.303-.462 7.044-.837 14.26-4.788 16.65-11.833 2.263-6.135 1.215-12.79 1.698-19.177.06-3.84.09-7.692-.262-11.52C72.596 7.844 63.223.981 53.834.684a219.453 219.453 0 0 0-16.022-.71zm11.294 12.882c5.5-.067 10.801 4.143 11.67 9.653.338 1.48.471 3 .471 
4.515v21.088h-8.357c-.07-7.588.153-15.182-.131-22.765-.587-4.368-7.04-5.747-9.672-2.397-2.422 3.04-1.47 7.155-1.67 10.735v6.392h-8.307c-.146-4.996.359-10.045-.404-15.002-1.108-4.218-7.809-5.565-10.094-1.666-1.685 3.046-.712 6.634-.976 9.936v14.767h-8.354c.109-8.165-.238-16.344.215-24.5.674-5.346 5.095-10.389 10.676-10.627 4.902-.739 10.103 2.038 12.053 6.631.375 1.435 1.76 1.932 1.994.084 1.844-3.704 5.501-6.739 9.785-6.771.367-.044.735-.068 1.101-.073z\"\/><defs><linearGradient id=\"paint0_linear_549_34\" x1=\"37.0692\" y1=\"0\" x2=\"37.0692\" y2=\"79\" gradientUnits=\"userSpaceOnUse\"><stop stop-color=\"#6364FF\"\/><stop offset=\"1\" stop-color=\"#563ACC\"\/><\/linearGradient><\/defs><\/svg><\/span><\/a><\/li><li class=\"shariff-button linkedin\" style=\"background-color:#ccc\"><a href=\"https:\/\/www.linkedin.com\/sharing\/share-offsite\/?url=https%3A%2F%2Fwww.defrent.de%2Ffr%2F2014%2F09%2Ffile-size-observations-on-the-iate-tbx-termbase%2F\" title=\"Envoyer par LinkedIn\" aria-label=\"Envoyer par LinkedIn\" role=\"button\" rel=\"noopener nofollow\" class=\"shariff-link\" style=\"; background-color:#79b428; color:#fff\" target=\"_blank\"><span class=\"shariff-icon\" style=\"\"><svg width=\"32px\" height=\"20px\" xmlns=\"http:\/\/www.w3.org\/2000\/svg\" viewBox=\"0 0 27 32\"><path fill=\"#0077b5\" d=\"M6.2 11.2v17.7h-5.9v-17.7h5.9zM6.6 5.7q0 1.3-0.9 2.2t-2.4 0.9h0q-1.5 0-2.4-0.9t-0.9-2.2 0.9-2.2 2.4-0.9 2.4 0.9 0.9 2.2zM27.4 18.7v10.1h-5.9v-9.5q0-1.9-0.7-2.9t-2.3-1.1q-1.1 0-1.9 0.6t-1.2 1.5q-0.2 0.5-0.2 1.4v9.9h-5.9q0-7.1 0-11.6t0-5.3l0-0.9h5.9v2.6h0q0.4-0.6 0.7-1t1-0.9 1.6-0.8 2-0.3q3 0 4.9 2t1.9 6z\"\/><\/svg><\/span><\/a><\/li><li class=\"shariff-button xing\" style=\"background-color:#ccc\"><a href=\"https:\/\/www.xing.com\/spi\/shares\/new?url=https%3A%2F%2Fwww.defrent.de%2Ffr%2F2014%2F09%2Ffile-size-observations-on-the-iate-tbx-termbase%2F\" title=\"Envoyer par XING\" aria-label=\"Envoyer par XING\" role=\"button\" rel=\"noopener nofollow\" 
class=\"shariff-link\" style=\"; background-color:#79b428; color:#fff\" target=\"_blank\"><span class=\"shariff-icon\" style=\"\"><svg width=\"32px\" height=\"20px\" xmlns=\"http:\/\/www.w3.org\/2000\/svg\" viewBox=\"0 0 25 32\"><path fill=\"#126567\" d=\"M10.7 11.9q-0.2 0.3-4.6 8.2-0.5 0.8-1.2 0.8h-4.3q-0.4 0-0.5-0.3t0-0.6l4.5-8q0 0 0 0l-2.9-5q-0.2-0.4 0-0.7 0.2-0.3 0.5-0.3h4.3q0.7 0 1.2 0.8zM25.1 0.4q0.2 0.3 0 0.7l-9.4 16.7 6 11q0.2 0.4 0 0.6-0.2 0.3-0.6 0.3h-4.3q-0.7 0-1.2-0.8l-6-11.1q0.3-0.6 9.5-16.8 0.4-0.8 1.2-0.8h4.3q0.4 0 0.5 0.3z\"\/><\/svg><\/span><\/a><\/li><li class=\"shariff-button facebook\" style=\"background-color:#ccc\"><a href=\"https:\/\/www.facebook.com\/sharer\/sharer.php?u=https%3A%2F%2Fwww.defrent.de%2Ffr%2F2014%2F09%2Ffile-size-observations-on-the-iate-tbx-termbase%2F\" title=\"Envoyer par Facebook\" aria-label=\"Envoyer par Facebook\" role=\"button\" rel=\"nofollow\" class=\"shariff-link\" style=\"; background-color:#79b428; color:#fff\" target=\"_blank\"><span class=\"shariff-icon\" style=\"\"><svg width=\"32px\" height=\"20px\" xmlns=\"http:\/\/www.w3.org\/2000\/svg\" viewBox=\"0 0 18 32\"><path fill=\"#3b5998\" d=\"M17.1 0.2v4.7h-2.8q-1.5 0-2.1 0.6t-0.5 1.9v3.4h5.2l-0.7 5.3h-4.5v13.6h-5.5v-13.6h-4.5v-5.3h4.5v-3.9q0-3.3 1.9-5.2t5-1.8q2.6 0 4.1 0.2z\"\/><\/svg><\/span><\/a><\/li><li class=\"shariff-button twitter\" style=\"background-color:#ccc\"><a href=\"https:\/\/twitter.com\/share?url=https%3A%2F%2Fwww.defrent.de%2Ffr%2F2014%2F09%2Ffile-size-observations-on-the-iate-tbx-termbase%2F&text=File%20size%20observations%20on%20the%20IATE%20TBX%20Termbase\" title=\"Envoyer par X\" aria-label=\"Envoyer par X\" role=\"button\" rel=\"noopener nofollow\" class=\"shariff-link\" style=\"; background-color:#79b428; color:#fff\" target=\"_blank\"><span class=\"shariff-icon\" style=\"\"><svg width=\"32px\" height=\"20px\" xmlns=\"http:\/\/www.w3.org\/2000\/svg\" viewBox=\"0 0 24 24\"><path fill=\"#000\" d=\"M14.258 10.152L23.176 0h-2.113l-7.747 
8.813L7.133 0H0l9.352 13.328L0 23.973h2.113l8.176-9.309 6.531 9.309h7.133zm-2.895 3.293l-.949-1.328L2.875 1.56h3.246l6.086 8.523.945 1.328 7.91 11.078h-3.246zm0 0\"\/><\/svg><\/span><\/a><\/li><\/ul><\/div><p>It has been known for a while now that a database dump of IATE, the EU Terminology Database, has been made available as a <a title=\"External Link: Download IATE TBX\" href=\"http:\/\/iate.europa.eu\/tbxPageDownload.do\" target=\"_blank\" rel=\"noopener noreferrer\">download<\/a> instead of a <a title=\"External Link: IATE Search Form\" href=\"http:\/\/iate.europa.eu\/SearchByQueryLoad.do?method=load\" target=\"_blank\" rel=\"noopener noreferrer\">web search form<\/a> in June 2014. The ZIP file is ~116 MB, the unpacked database 2.2 GB (!) large. Since it contains all EU languages, I split this file into 4 subfiles, and extracted four trilingual DE\/FR\/EN files using an XSL transformation sheet. xsltproc.exe from Apache&rsquo;s <a title=\"External Link: Xerces on Apache.org\" href=\"http:\/\/xerces.apache.org\/\" target=\"_blank\" rel=\"noopener noreferrer\">Xerces XML Parser<\/a> package couldn&rsquo;t cope with the complete file, but the four 550MB files passed through in about 10 minutes each and dropped to about half their original size.<\/p>\n<p><!--more--><\/p>\n<p>About 250-275MB per file is still quite fat, so I thought about ways to reduce this further. (Un-)fortunately, IATE isn&rsquo;t exactly renowned for its accuracy &#8211; colleagues in the know will always tell you to use IATE with caution. IATE has a \u00ab\u00a0Reliability\u00a0\u00bb rating which is assigned to each entry, running from 1 (unchecked) via 2 (minimal reliability) and 3 (reliable) to 4 (very reliable\/assessed). Thus, I was tempted to throw out all Reliability 1+2 entries and considered also doing away with Rel. 
3 entries, since the IATE team itself <a title=\"External Link: IATE TBX Download Notice\" href=\"https:\/\/web.archive.org\/web\/20180628143406\/http:\/\/iate.europa.eu:80\/tbx\/IATE%20Data%20Fields%20Explaind.htm\" target=\"_blank\" rel=\"noopener noreferrer\">notes<\/a>:<\/p>\n<blockquote><p>This code was automatically assigned to many entries, regardless of their previous validation status, following the merger of existing databases to create IATE. Therefore some entries marked as \u2018reliable\u2019 are not necessarily so.<\/p><\/blockquote>\n<p>Uh-huh. So basically, all sorts of stuff was thrown in and instead of correctly classifying it as minimally reliable (Rel. 2) until the material could be reviewed, it was decided to recommend it as \u00ab\u00a0reliable\u00a0\u00bb (Rel. 3). That was the point at which I wrote two more XSL sheets to <strong>filter for Reliability 3+4 (R34) and exclusively for Reliability 4 (R4)<\/strong>. Since that run looked promising, I wrote yet another XSLT script to <strong>clean up the results (C)<\/strong>, deleting empty language groups (\u00ab\u00a0tig\u00a0\u00bb elements) or even empty Term entries (\u00ab\u00a0termEntry\u00a0\u00bb elements). Here&rsquo;s what happened:<\/p>\n<table border=\"1\" width=\"100%\" cellspacing=\"0\" cellpadding=\"2\">\n<caption>IATE TBX File Size Reductions for DE\/FR\/EN<\/caption>\n<thead>\n<tr>\n<th bgcolor=\"#cccccc\" width=\"11%\"><strong>Filename<\/strong><\/th>\n<th bgcolor=\"#cccccc\"><strong>Orig. 
Size<\/strong><\/th>\n<th bgcolor=\"#cccccc\"><strong>R34 Size<\/strong><\/th>\n<th bgcolor=\"#cccccc\"><strong>R34 % from Orig<\/strong><\/th>\n<th bgcolor=\"#cccccc\"><strong>R34 Cleaned Size<\/strong><\/th>\n<th bgcolor=\"#cccccc\"><strong>R34C % from R34<\/strong><\/th>\n<th bgcolor=\"#cccccc\"><strong>R34C % from Orig<\/strong><\/th>\n<th bgcolor=\"#cccccc\"><strong>R4 Size<\/strong><\/th>\n<th bgcolor=\"#cccccc\"><strong>R4 % from Orig<\/strong><\/th>\n<th bgcolor=\"#cccccc\"><strong>R4 Cleaned Size<\/strong><\/th>\n<th bgcolor=\"#cccccc\"><strong>R4C % from R4<\/strong><\/th>\n<th bgcolor=\"#cccccc\"><strong>R4C % from Orig<\/strong><\/th>\n<\/tr>\n<\/thead>\n<tbody>\n<tr>\n<td>IATE-de-fr-en-1of4.tbx<\/td>\n<td><span style=\"color: #ff0000;\">273 MB<\/span><\/td>\n<td><span style=\"color: #ff6600;\">166 MB<\/span><\/td>\n<td>-39%<\/td>\n<td><span style=\"color: #008000;\">125 MB<\/span><\/td>\n<td>-25%<\/td>\n<td><strong>-54%<\/strong><\/td>\n<td><span style=\"color: #ff6600;\">113 MB<\/span><\/td>\n<td>-59%<\/td>\n<td><span style=\"color: #008000;\">57 MB<\/span><\/td>\n<td>-50%<\/td>\n<td><strong>-79%<\/strong><\/td>\n<\/tr>\n<tr>\n<td>IATE-de-fr-en-2of4.tbx<\/td>\n<td><span style=\"color: #ff0000;\">276 MB<\/span><\/td>\n<td><span style=\"color: #ff6600;\">233 MB<\/span><\/td>\n<td>-16%<\/td>\n<td><span style=\"color: #008000;\">212 MB<\/span><\/td>\n<td>-9,0%<\/td>\n<td><strong>-23%<\/strong><\/td>\n<td><span style=\"color: #ff6600;\">106 MB<\/span><\/td>\n<td>-62%<\/td>\n<td><span style=\"color: #008000;\">50 MB<\/span><\/td>\n<td>-53%<\/td>\n<td><strong>-82%<\/strong><\/td>\n<\/tr>\n<tr>\n<td>IATE-de-fr-en-3of4.tbx<\/td>\n<td><span style=\"color: #ff0000;\">253 MB<\/span><\/td>\n<td><span style=\"color: #ff6600;\">213 MB<\/span><\/td>\n<td>-16%<\/td>\n<td><span style=\"color: #008000;\">192 MB<\/span><\/td>\n<td>-10%<\/td>\n<td><strong>-24%<\/strong><\/td>\n<td><span style=\"color: #ff6600;\">100 MB<\/span><\/td>\n<td>-61%<\/td>\n<td><span 
style=\"color: #008000;\">46 MB<\/span><\/td>\n<td>-54%<\/td>\n<td><strong>-82%<\/strong><\/td>\n<\/tr>\n<tr>\n<td>IATE-de-fr-en-4of4.tbx<\/td>\n<td><span style=\"color: #ff0000;\">271 MB<\/span><\/td>\n<td><span style=\"color: #ff6600;\">245 MB<\/span><\/td>\n<td>-10%<\/td>\n<td><span style=\"color: #008000;\">231 MB<\/span><\/td>\n<td>-6%<\/td>\n<td><strong>-15%<\/strong><\/td>\n<td><span style=\"color: #ff6600;\">107 MB<\/span><\/td>\n<td>-61%<\/td>\n<td><span style=\"color: #008000;\">56 MB<\/span><\/td>\n<td>-48%<\/td>\n<td><strong>-79%<\/strong><\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n<p><strong>Now, what does this <em>mean<\/em>?<\/strong><\/p>\n<p>Apparently, German, English and French make up roughly 50% of the whole IATE database. This isn&rsquo;t astonishing, as DE, FR and EN are the OFFICIAL official languages of the EU (that means, all documents must be made available in at least one of these three languages). But it also means that on average, 80% of the chosen DE\/FR\/EN data subset are classed as \u00ab\u00a0reliable or very reliable\u00a0\u00bb and still almost 40% as \u00ab\u00a0very reliable\u00a0\u00bb.<\/p>\n<p>Additionally, this means that by cutting out all unreliable entries and all the unnecessary bits (empty tags, superfluous whitespace, etc.), we can achieve significant file size reductions. This plays an important role during import of the TBX database into other systems, notably SDL Trados Studio&rsquo;s beloved companion, SDL MultiTerm, which didn&rsquo;t manage to import the original DE-FR-EN files without lots of \u00ab\u00a0file lock limit\u00a0\u00bb errors. More on that in another post, perhaps, but Paul Filkin already wrote on that in <a title=\"External Link: Multifarious - What A Whopper\" href=\"http:\/\/multifarious.filkin.com\/2014\/07\/13\/what-a-whopper\/\" target=\"_blank\" rel=\"noopener noreferrer\">What A Whopper<\/a>. 
The message is: \u00ab\u00a0<strong>Don&rsquo;t use IATE as-is, adapt it to your needs!<\/strong>\u00a0\u00bb For example, one could further filter IATE by the \u00ab\u00a0field\u00a0\u00bb column to adapt it to one&rsquo;s own expert fields as a translator.<\/p>\n<p>If you are interested in the XSL transformation sheets used, you can download them as a <a title=\"DeFrEnT: IATE XSL Transformations (3kB ZIP)\" href=\"https:\/\/www.defrent.de\/wp-content\/uploads\/2014\/09\/DeFrEnT_IATE-XSL-Transformations.zip\">3kB ZIP file<\/a>. If you don&rsquo;t know anything about XML\/XSL, but would like to have a look at the resulting varieties of DE-FR-EN TBX files, send me a nice-to-read e-mail to <em>info ~at~ defrent ~dot~ de<\/em> (no \u00ab\u00a0me too!\u00a0\u00bb blog comments, please). The \u00ab\u00a0unedited TBX\u00a0\u00bb ZIP file weighs in at ~55MB, the filtered Reliability 3+4 ZIP is ~37MB and the Reliability 4 ZIP is only ~7.5 MB. Since the resulting SDL MultiTerm termbases are 5 times as heavy as the corresponding TBX file, I am reluctant to send out those, but with the free MultiTerm Convert tool from the SDL OpenExchange, conversion should be a matter of minutes. Of course, the IATE usage conditions from their download site apply to the edited files, too:<\/p>\n<blockquote><p>You are allowed to reproduce the data provided on this page for your personal needs, to distribute it for non-commercial and commercial purposes, and to make and distribute derivative works, provided the source is acknowledged as follows: Download IATE, European Union, 2014.<\/p><\/blockquote>\n<p><span style=\"color:red;\">Edit (1st Oct. 
2014):<\/span> @jeromobot pointed me to <a href=\"http:\/\/multifarious.filkin.com\/2014\/09\/30\/iate-the-last-word-maybe\/\" title=\"External Link: Paul Filkin\u00b4s Blog Multifarious - IATE the last word\" target=\"_blank\" rel=\"noopener noreferrer\">Paul Filkin&rsquo;s recommendation<\/a>, which I will repeat here in short: If you are looking for more thoroughly cleaned IATE files that are ready for import into your CAT, you might want to visit <a title=\"External Link: santrans.net\" href=\"http:\/\/santrans.net\/\" target=\"_blank\" rel=\"noopener noreferrer\">Henk Sanderson&rsquo;s site SanTrans<\/a>, where he also mentions additional IATE pitfalls, like terms-that-aren&rsquo;t and escaped (pseudo-)HTML codes like &amp;lt;i&amp;gt;some term&amp;lt;\/i&amp;gt; inside entries.<\/p>\n","protected":false},"excerpt":{"rendered":"<p>It has been known for a while now that a database dump of IATE, the EU Terminology Database, has been made available as a download instead of a web search form in June 2014. The ZIP file is ~116 MB, the unpacked database 2.2 GB (!) large. 
Since it contains all EU languages, I split<\/p><\/div>\n<div class=\"blog-btn\"><a href=\"https:\/\/www.defrent.de\/fr\/2014\/09\/file-size-observations-on-the-iate-tbx-termbase\/\" class=\"home-blog-btn\">Lire la suite<\/a><\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"activitypub_content_warning":"","activitypub_content_visibility":"","activitypub_max_image_attachments":4,"activitypub_interaction_policy_quote":"anyone","activitypub_status":"","footnotes":""},"categories":[21,23],"tags":[49,73,77,82,83,84],"class_list":["post-313","post","type-post","status-publish","format-standard","hentry","category-de","category-howto-de","tag-iate","tag-tbx","tag-trados-studio","tag-xml","tag-xsl","tag-xslt"],"aioseo_notices":[],"featured_image_src":null,"featured_image_src_square":null,"author_info":{"display_name":"Christopher K\u00f6bel","author_link":"https:\/\/www.defrent.de\/fr\/author\/defrenter\/"},"_links":{"self":[{"href":"https:\/\/www.defrent.de\/fr\/wp-json\/wp\/v2\/posts\/313","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/www.defrent.de\/fr\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/www.defrent.de\/fr\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/www.defrent.de\/fr\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/www.defrent.de\/fr\/wp-json\/wp\/v2\/comments?post=313"}],"version-history":[{"count":3,"href":"https:\/\/www.defrent.de\/fr\/wp-json\/wp\/v2\/posts\/313\/revisions"}],"predecessor-version":[{"id":1194,"href":"https:\/\/www.defrent.de\/fr\/wp-json\/wp\/v2\/posts\/313\/revisions\/1194"}],"wp:attachment":[{"href":"https:\/\/www.defrent.de\/fr\/wp-json\/wp\/v2\/media?parent=313"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/www.defrent.de\/fr\/wp-json\/wp\/v2\/categories?post=313"},{"taxonomy":"post_tag","embeddable":true,"h
ref":"https:\/\/www.defrent.de\/fr\/wp-json\/wp\/v2\/tags?post=313"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}