Viewing Paste #308

  • Formatted Text - v

    1. <?php
    2.  
    // Input path: everything after the first 11 characters of the first CLI argument.
    // NOTE(review): the stripped prefix is presumably an option name such as
    // "--filename=" - the prefix itself is never inspected; confirm with the caller.
    $argument = isset($_SERVER['argv'][1]) ? (string) $_SERVER['argv'][1] : '';
    $filename = (string) substr($argument, 11);
    if ($filename === '')
    {
        file_put_contents('php://stderr', "No input file given.\n");
        exit(1);
    }

    // file() returns false when the file cannot be read; stop instead of
    // carrying on and writing an empty document.
    $lines = file($filename);
    if ($lines === false)
    {
        file_put_contents('php://stderr', 'Unable to read input file: ' . $filename . "\n");
        exit(1);
    }

    // Parser state, shared with the line loop further down.
    // Are we inside a first level (<h3>) section?
    $in_section = false;
    // Are we inside a second level (<h4>) sub section?
    $in_sub_section = false;
    // Title of the current <h3> section.
    $title = '';
    // Title of the current <h4> sub section.
    $sub_title = '';
    // Have we passed the opening <body> line yet?
    $in_html_body = false;
    // Have we seen the "<h3>References</h3>" heading (and not yet closed its list)?
    $found_reference = false;
    // Are we inside the <ol>/<ul> of the reference list?
    $in_list = false;
    // Table counter; not modified by the code in this part of the script.
    $tables_count = 0;

    // Front matter, filled from "Label: value" lines. The date defaults to today.
    $front = array(
        'article_id' => 0,
        'publication' => 0,
        'column' => '',
        'volume' => '',
        'issue' => '',
        'pages' => '',
        'date' => array(
            'day' => date('d'),
            'month' => date('m'),
            'year' => date('Y'),
        ),
        'pub_abbr' => '',
    );

    // Back matter: references, acknowledgement style fields and sidebars.
    $back = array(
        'references' => array(),
        'ack' => '',
        'financial' => '',
        'conflict' => '',
        'abbr' => '',
        'reprint' => '',
        'sidebar' => array(),
    );

    // Parsed sections, keyed by a lower-cased, truncated title.
    $sections = array();
    // Line-by-line parser for the HTML body.
    //
    // Every non-blank line after "<body>" is classified, in this order, as:
    //   1. a "Label: value" header line (stored in $front or $back),
    //   2. part of the reference list (stored in $back['references']),
    //   3. an <h3> heading (opens a new entry in $sections),
    //   4. an <h4> heading (opens a new sub section of the current section),
    //   5. content, appended to the 'data' list of the current sub section.
    // Parsing stops at "</body>" or "</html>".

    // "Label:" prefixes that map directly onto a key of $front.
    $front_labels = array(
        'Article ID:' => 'article_id',
        'Publication ID:' => 'publication',
        'Section/Column ID:' => 'column',
        'Volume:' => 'volume',
        'Issue:' => 'issue',
        'Pages:' => 'pages',
        'Surname:' => 'surname',
        'Disclosure:' => 'disclosure',
        'Bio:' => 'bio',
        'Pub Abbr:' => 'pub_abbr',
    );

    // "Label:" prefixes that map directly onto a key of $back.
    // "Financial:" and "Funding:" are two spellings of the same field.
    $back_labels = array(
        'Ack:' => 'ack',
        'Financial:' => 'financial',
        'Funding:' => 'financial',
        'Conflict:' => 'conflict',
        'Abbr:' => 'abbr',
        'Reprint:' => 'reprint',
    );

    // These are read further down before any heading has necessarily been seen,
    // so give them defined values up front.
    $section_type = '';
    $clean_title = '';
    $clean_sub_title = '';
    // Counter used to build keys for sub sections that have no <h4> heading.
    $untitled_count = 0;

    foreach ($lines as $key => $line)
    {
        // Clean spaces.
        $line = trim($line);

        // No blank lines. Compare against '' explicitly: empty() would also
        // throw away a line that consists of a single "0".
        if ($line === '')
            continue;

        // Skip the head matter, including the <body> line itself.
        if (!$in_html_body)
        {
            if ($line == '<body>')
                $in_html_body = true;
            continue;
        }

        // We just end here. We don't parse the table and image stuff just yet.
        if (in_array($line, array('</body>', '</html>'), true))
            break;

        // Comment delimiters standing alone on a line are dropped.
        if (in_array($line, array('<!--', '-->'), true))
            continue;

        // Header lines first since they are usually on top.
        $handled = false;
        foreach ($front_labels as $label => $field)
        {
            if (strncmp($line, $label, strlen($label)) === 0)
            {
                $front[$field] = trim((string) substr($line, strlen($label)));
                $handled = true;
                break;
            }
        }
        if (!$handled)
        {
            foreach ($back_labels as $label => $field)
            {
                if (strncmp($line, $label, strlen($label)) === 0)
                {
                    $back[$field] = trim((string) substr($line, strlen($label)));
                    $handled = true;
                    break;
                }
            }
        }
        if ($handled)
            continue;

        // Publication date, written as month/day/year. Missing parts fall back
        // to today's date; array_pad() keeps list() from reading absent offsets.
        if (strncmp($line, 'Pub Date:', 9) === 0)
        {
            $date_parts = array_pad(explode('/', trim((string) substr($line, 9))), 3, '');
            list ($month, $day, $year) = $date_parts;
            $front['date']['month'] = empty($month) ? date('m') : $month;
            $front['date']['day'] = empty($day) ? date('d') : $day;
            $front['date']['year'] = empty($year) ? date('Y') : $year;
            continue;
        }

        // Only the literal value "true" switches images on.
        if (strncmp($line, 'Images:', 7) === 0)
        {
            $front['has_images'] = trim((string) substr($line, 7)) == 'true';
            continue;
        }

        // References here.
        if ($line == '<h3>References</h3>')
        {
            $found_reference = true;
            continue;
        }

        // A new first level heading while we are already in a section closes the current one.
        if ($in_section && !$found_reference && substr($line, 0, 3) == '<h3')
        {
            $in_section = false;
            $in_sub_section = false;
            $title = '';
            $sub_title = '';
            $section_type = '';
        }

        // A section? Group 1 is the optional id attribute, group 2 the heading text.
        // Group 2 always takes part in a match, so $matches always has both entries
        // (group 1 is an empty string when the id is absent).
        if (!$in_section && !$found_reference && preg_match('~^<h3(?:\s*id="([a-zA-Z0-9_]+)")?>(.+?)</h3>$~', $line, $matches))
        {
            $in_section = true;
            $title = trim($matches[2]);

            // Clean title, made unique so that two headings with the same text
            // (for example several "." pages) do not overwrite each other.
            $clean_title = strtolower(substr($title, 0, 50));
            $base_key = $clean_title;
            $duplicate = 1;
            while (isset($sections[$clean_title]))
                $clean_title = $base_key . '_' . (++$duplicate);

            // We start a new section in the array. A title of "." means "no title".
            $sections[$clean_title] = array(
                'title' => $title == '.' ? '' : $title,
                'sub_section' => array(),
                'type' => trim($matches[1]),
            );

            // NEXT...
            continue;
        }

        // The reference list opens with <ol> or <ul> after the References heading.
        if ($found_reference && ($line == '<ol>' || $line == '<ul>'))
        {
            $in_list = true;
            $reference_ordered = $line == '<ol>' ? true : false;
            continue;
        }

        // One reference per <li> line; the leading "<li>" and the last five
        // characters (the closing "</li>") are cut off.
        if ($in_list && $found_reference && substr($line, 0, 4) == '<li>')
        {
            $back['references'][] = htmlspecialchars_uni(substr($line, 4, -5));
            continue;
        }

        // End of the reference list.
        if ($in_list && $found_reference && ($line == '</ol>' || $line == '</ul>'))
        {
            $found_reference = false;
            $in_list = false;
            continue;
        }

        // In a sub section and another one pops up? Then the old one is over.
        if ($in_section && $in_sub_section && substr($line, 0, 3) == '<h4')
            $in_sub_section = false;

        // Sub section.
        if ($in_section && !$in_sub_section && preg_match('~^<h4(?:\s*id="([a-zA-Z0-9_]+)")?>(.+?)</h4>$~', $line, $matches))
        {
            $in_sub_section = true;
            $sub_title = trim($matches[2]);

            // Clean sub title, unique within the section so that equally named
            // sub sections are not merged into one.
            $clean_sub_title = strtolower(substr($sub_title, 0, 50));
            $base_key = $clean_sub_title;
            $duplicate = 1;
            while (isset($sections[$clean_title]['sub_section'][$clean_sub_title]))
                $clean_sub_title = $base_key . '_' . (++$duplicate);

            // Type comes from the optional id attribute.
            $section_type = trim($matches[1]);

            // We don't need any more data from this line so... NEXT!!!
            continue;
        }
        elseif ($in_section && !$in_sub_section)
        {
            // Content without an <h4> heading gets an untitled sub section. Reset
            // title and type so it does not inherit them from an earlier one.
            $in_sub_section = true;
            $sub_title = '';
            $section_type = '';
            do
            {
                $clean_sub_title = 'no_title_' . (++$untitled_count);
            }
            while (isset($sections[$clean_title]['sub_section'][$clean_sub_title]));
        }

        // Content that shows up before any heading lands in an untitled section.
        if (!isset($sections[$clean_title]))
            $sections[$clean_title] = array(
                'title' => '',
                'sub_section' => array(),
            );

        if (!isset($sections[$clean_title]['sub_section'][$clean_sub_title]))
            $sections[$clean_title]['sub_section'][$clean_sub_title] = array(
                'title' => $sub_title,
                'type' => $section_type,
                'data' => array(),
            );

        // Now for the data.
        $sections[$clean_title]['sub_section'][$clean_sub_title]['data'][] = $line;
    }
    260.  
    // Assemble the <body> element: one "page" <sec> per first level section,
    // holding one nested <sec> per sub section. Sections typed "sidebar" are
    // not rendered here; they are handed over to the back matter instead.
    $body = "\n<body>";

    foreach ($sections as $id => $section)
    {
        $is_sidebar = isset($section['type']) && $section['type'] == 'sidebar';
        if ($is_sidebar)
        {
            $back['sidebar'][] = $section;
            continue;
        }

        // Page level title, or an empty title element when there is none.
        if (empty($section['title']))
            $page_title = '<title/>';
        else
            $page_title = '<title>' . htmlspecialchars_uni($section['title']) . '</title>';

        $body .= "\n<sec sec-type=\"page\">\n" . $page_title;

        foreach ($section['sub_section'] as $name => $sub_section)
        {
            // Sub sections without a type are plain content.
            if (empty($sub_section['type']))
                $sec_type = 'content';
            else
                $sec_type = $sub_section['type'];

            if (empty($sub_section['title']))
                $sec_title = '<title/>';
            else
                $sec_title = '<title>' . htmlspecialchars_uni($sub_section['title']) . '</title>';

            // Opening tag and title, then the converted data lines, then the closing tag.
            $body .= "\n<sec sec-type=\"" . $sec_type . "\">\n" . $sec_title;
            $body .= parse_content($sub_section['data']);
            $body .= "\n</sec>";
        }

        $body .= "\n</sec>";
    }

    $body .= "\n</body>";
    298.  
    // Build the complete document, print it, and save it next to the input
    // file under the same base name with an .xml extension.

    // Generate the front section
    $front_xml = generate_front($front);
    // References
    $back_xml = generate_back($back);

    $xml = '<?xml version="1.0" encoding="UTF-8"?>' . "\n"
        . '<!DOCTYPE article SYSTEM "/content/journal-publishing-dtd-1.1/journalpublishing.dtd">' . "\n"
        . '<article>' . $front_xml . $body . $back_xml . "\n"
        . '</article>';

    // Emails: every address found anywhere in the document is replaced by a mailto ext-link.
    $linked = preg_replace('~([0-9A-Za-z=_+\-/][0-9A-Za-z=_\'+\-/\.]*@[\w\-]+(\.[\w\-]+)*(\.[\w]{2,6}))~', '<ext-link ext-link-type="mailto" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="\1" xlink:title="\1"/>', $xml);
    // preg_replace() returns null on a PCRE error; keep the unlinked document
    // in that case rather than losing everything.
    if ($linked !== null)
        $xml = $linked;

    echo $xml;

    $target = dirname($filename) . '/' . basename($filename, '.html') . '.xml';
    $fp = fopen($target, 'w+');
    if ($fp === false)
    {
        // Without this check fwrite() and fclose() would be handed false.
        file_put_contents('php://stderr', "\nCOULD NOT OPEN " . $target . " FOR WRITING\n");
    }
    else
    {
        if (fwrite($fp, $xml))
            echo "\nWROTE AND SAVED FILE";
        fclose($fp);
    }
    318.  
    319. function xml_replacement($data)
    320. {
    321. global $front;
    322.  
    323. $data = preg_replace('~(Table|Exhibit|Box) (\d+)~', '<xref ref-type="table" rid="T\2">\1 \2</xref>', $data);
    324.  
    325. // Images now?
    326. if (isset($front['pub_abbr']) && preg_match('~\[\[(InlineFigure|Figure)\|\^\|\d+\|\^\|.+?\]\]~', $data, $matches))
    327. {
    328. if (trim($matches[1]) == 'InlineFigure')
    329. $data = preg_replace('~\[\[InlineFigure\|\^\|(\d+)\|\^\|(.+?)\]\]~', "
    330. <inline-graphic xmlns:xlink=\"http://www.w3.org/1999/xlink\" xlink:href=\"/images/" . substr($front['article_id'], 0, 3) . "/" . substr($front['article_id'], -3) . "/art-" . $front['pub_abbr'] . $front['article_id'] . ".equation$1.gif\">
    331. <alt-text>Equation</alt-text>
    332. </inline-graphic>
    333. ", $data);
    334. else
    335. $data = preg_replace('~\[\[Figure\|\^\|(\d+)\|\^\|(.+?)\]\]~', "
    336. <fig id=\"F\$1\">
    337. <label>Figure \$1.</label>
    338. <caption>