<?php
// Input path: the first command-line argument with its first 11 characters removed.
// NOTE(review): presumably the argument carries an 11-character prefix (an option
// name or directory) - confirm against how this script is invoked.
$filename =
substr($_SERVER['argv'][1],
11);
// Read the whole input file into an array, one element per line.
$lines =
file($filename);
// Are we inside a first-level section (opened by an <h3> heading)?
$in_section = false;
// Are we inside a sub section (opened by an <h4> heading)?
$in_sub_section = false;
// Title of the current first-level section (<h3>).
$title = '';
// Title of the current sub section (<h4>).
$sub_title = '';
// Have we passed the opening <body> line yet?
$in_html_body = false;
// Set once the "<h3>References</h3>" heading has been seen; cleared when the list closes.
$found_reference = false;
// Are we inside the <ol>/<ul> list that holds the references?
$in_list = false;
// Table counter. Not read or written anywhere else in the visible part of the file.
$tables_count = 0;
// NOTE(review): the entries below look like the tail of an array literal whose
// opening statement is missing from this file. Later code reads
// $front['article_id'] and $front['pub_abbr'], so this is presumably the
// default-valued $front (front matter) array - confirm and restore the opener.
'article_id' => 0,
'publication' => 0,
'column' => '',
'volume' => '',
'issue' => '',
'pages' => '',
),
'pub_abbr' => '',
);
// NOTE(review): likewise the tail of a second array literal with no opener.
// The parsing loop writes $back['financial'], so this is presumably the
// default-valued $back (back matter) array - confirm.
'ack' => '',
'financial' => '',
'conflict' => '',
'abbr' => '',
'reprint' => '',
);
// Main parsing pass: walk the input line by line, picking up the "Key: value"
// header fields, the reference list, and the <h3>/<h4> section structure.
// Everything else is stored as section data in $sections.
foreach ($lines as $key => $line)
{
// Clean spaces.
// No blank lines.
// NOTE(review): the trimming statement and the blank-line test these two comments
// describe are not present. As written this continue is unconditional, so every
// iteration stops here and the rest of the loop body never runs - restore the guard.
continue;
// Skip the head matter.
if (!$in_html_body && $line != '<body>')
continue;
elseif (!$in_html_body && $line == '<body>')
{
$in_html_body = true;
continue;
}
// We just end here. We don't parse the table and image stuff just yet.
// NOTE(review): the condition guarding this break is missing from the file.
break;
// We continue with these.
// NOTE(review): the condition guarding this continue is missing from the file.
continue;
// Lets just do the head stuff first since it's usually on top.
// Each branch recognises one "Key:" prefix by comparing the first N characters of
// the line. Most branches only skip the line: no value is stored in the visible code.
if (substr($line,
0,
11) ==
'Article ID:') {
continue;
}
elseif (substr($line,
0,
15) ==
'Publication ID:') {
continue;
}
elseif (substr($line,
0,
18) ==
'Section/Column ID:') {
continue;
}
elseif (substr($line,
0,
7) ==
'Volume:') {
continue;
}
elseif (substr($line,
0,
6) ==
'Issue:') {
continue;
}
elseif (substr($line,
0,
6) ==
'Pages:') {
continue;
}
elseif (substr($line,
0,
9) ==
'Pub Date:') {
// Each date part falls back to today's value when its variable is empty.
// NOTE(review): $month, $day and $year are never assigned in the visible code, so
// as shown the publication date always becomes the current date - confirm.
$front['date']['month'] =
empty($month) ?
date('m') :
$month;
$front['date']['day'] =
empty($day) ?
date('d') :
$day;
$front['date']['year'] =
empty($year) ?
date('Y') :
$year;
continue;
}
elseif (substr($line,
0,
8) ==
'Surname:') {
continue;
}
elseif (substr($line,
0,
11) ==
'Disclosure:') {
continue;
}
elseif (substr($line,
0,
4) ==
'Bio:') {
continue;
}
elseif (substr($line,
0,
7) ==
'Images:') {
// True only when the text after "Images:" is exactly 'true' once trimmed.
$front['has_images'] =
trim(substr($line,
7)) ==
'true' ?
true :
false;
continue;
}
elseif (substr($line,
0,
9) ==
'Pub Abbr:') {
continue;
}
elseif (substr($line,
0,
4) ==
'Ack:') {
continue;
}
elseif (substr($line,
0,
10) ==
'Financial:' ||
substr($line,
0,
8) ==
'Funding:') {
// Both labels feed the same field; the offset (10 or 8) is the length of
// whichever label matched, so only the value after the label is kept.
$back['financial'] =
trim(substr($line,
substr($line,
0,
10) ==
'Financial:' ?
10 :
8));
continue;
}
elseif (substr($line,
0,
9) ==
'Conflict:') {
continue;
}
elseif (substr($line,
0,
5) ==
'Abbr:') {
continue;
}
elseif (substr($line,
0,
8) ==
'Reprint:') {
continue;
}
// References here.
// The references heading switches the loop into reference mode; while it is on,
// <h3> lines are not treated as section starts (see the !$found_reference tests below).
if (trim($line) ==
'<h3>References</h3>') {
$found_reference = true;
continue;
}
// In a first level section and we already in one. Just go to the new one.
// Only the state is reset here; the new heading itself is picked up by the next test.
if ($in_section && !
$found_reference &&
substr($line,
0,
3) ==
'<h3') {
$in_section = false;
$in_sub_section = false;
$title = '';
$sub_title = '';
$section_type = '';
}
// A section?
// The pattern captures an optional id attribute (group 1) and the heading text (group 2).
if (!
$in_section && !
$found_reference &&
preg_match('~^<h3(?:\s*id="([a-zA-Z0-9_]+)")?>(.+?)</h3>$~',
$line,
$matches)) {
$in_section = true;
// NOTE(review): group 2 always participates when the pattern matches, so
// preg_match should always yield 3 entries (group 1 as '' when there is no id).
// The count == 2 branch therefore looks unreachable - confirm.
$title =
count($matches) ==
2 ?
trim($matches[1]) :
trim($matches[2]);
// Clean title.
// NOTE(review): $clean_title is used as the array key below but is never assigned
// in the visible code; the statement this comment announced appears to be missing.
// We start a new section in the array.
// A title consisting of a single '.' is stored as an empty title.
$sections[$clean_title] =
array( 'title' => $title == '.' ? '' : $title,
'sub_section' =>
array(),
);
// The id attribute, when captured, is stored as the section type; the output
// stage compares it against 'sidebar'.
if (count($matches) ==
3) $sections[$clean_title]['type'] =
trim($matches[1]);
// NEXT...
continue;
}
// First lets check if we even doing references yet then check for list.
// NOTE(review): these are exact comparisons against the untrimmed $line, so they
// rely on the line having been trimmed earlier in the loop - confirm.
if ($found_reference && ($line == '<ol>' || $line == '<ul>'))
{
$in_list = true;
$reference_ordered = $line == '<ol>' ? true : false;
continue;
}
// One reference per <li> line: drop the first 4 characters ("<li>") and the last 5.
// NOTE(review): assumes the trimmed line ends with "</li>" - confirm.
if ($in_list &&
$found_reference &&
substr(trim($line),
0,
4) ==
'<li>') {
$back['references'][] = htmlspecialchars_uni
(substr(trim($line),
4,
-5));
continue;
}
// The closing list tag ends reference mode entirely.
if ($in_list && $found_reference && ($line == '</ol>' || $line == '</ul>'))
{
$found_reference = false;
$in_list = false;
continue;
}
// In a sub section and another one just pops up? We no longer in the other one then. One section at a time.
if ($in_section &&
$in_sub_section &&
substr($line,
0,
3) ==
'<h4') $in_sub_section = false;
// Sub section.
// Same pattern shape as the <h3> case: optional id in group 1, heading text in group 2.
if ($in_section && !
$in_sub_section &&
preg_match('~^<h4(?:\s*id="([a-zA-Z0-9_]+)")?>(.+?)</h4>$~',
$line,
$matches)) {
$in_sub_section = true;
$sub_title =
count($matches) ==
2 ?
trim($matches[1]) :
trim($matches[2]);
// Clean sub title.
// NOTE(review): $clean_sub_title is used as a key further down but is never
// assigned in the visible code; the statement announced here appears to be missing.
// Type?
$section_type = '';
if (count($matches) ==
3) $section_type =
trim($matches[1]);
// We don't need any more data from this line so... NEXT!!!
continue;
}
elseif ($in_section && !$in_sub_section)
{
// Content that appears before any <h4> heading: mark a sub section as open without
// setting a sub title, so the line below is still stored as data.
$in_sub_section = true;
}
// Create the sub-section entry the first time it is needed.
if (isset($sections[$clean_title]) && !
isset($sections[$clean_title]['sub_section'][$clean_sub_title])) $sections[$clean_title]['sub_section'][$clean_sub_title] =
array( 'title' => $sub_title,
'type' => $section_type,
);
// Now for the data.
// Any line not consumed above is appended to the current sub section's data list.
$sections[$clean_title]['sub_section'][$clean_sub_title]['data'][] = $line;
}
// Render the parsed sections into the <body> element. Sidebar sections are not
// rendered here; they are collected on $back['sidebar'] for the back matter.
$body = '
<body>';
foreach ($sections as $id => $section)
{
    // Sidebars go in the back, so park the section and move on.
    $is_sidebar = isset($section['type']) && $section['type'] == 'sidebar';
    if ($is_sidebar)
    {
        $back['sidebar'][] = $section;
        continue;
    }

    // A first-level section becomes a "page"; an empty title collapses to <title/>.
    if (empty($section['title']))
        $page_title_xml = '<title/>';
    else
        $page_title_xml = '<title>' . htmlspecialchars_uni($section['title']) . '</title>';

    $body .= '
<sec sec-type="page">
' . $page_title_xml;

    // Each sub section is a nested <sec>; its type defaults to "content".
    foreach ($section['sub_section'] as $name => $sub_section)
    {
        $sub_sec_type = empty($sub_section['type']) ? 'content' : $sub_section['type'];

        if (empty($sub_section['title']))
            $sub_title_xml = '<title/>';
        else
            $sub_title_xml = '<title>' . htmlspecialchars_uni($sub_section['title']) . '</title>';

        $body .= '
<sec sec-type="' . $sub_sec_type . '"> ' . $sub_title_xml;

        // The collected lines of the sub section are converted by parse_content().
        $body .= parse_content($sub_section['data']);

        $body .= '
</sec>';
    }

    // Close the page-level section.
    $body .= '
</sec>';
}
$body .= '
</body>';
// Front matter first, then the back matter (references and related fields).
$front_xml = generate_front($front);
$back_xml = generate_back($back);

// Stitch the full article document together: prolog, front, body, back, closing tag.
$xml = '<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article SYSTEM "/content/journal-publishing-dtd-1.1/journalpublishing.dtd">
<article>';
$xml .= $front_xml;
$xml .= $body;
$xml .= $back_xml;
$xml .= '
</article>';

// Replace every e-mail address in the document with a mailto ext-link element.
$email_pattern = '~([0-9A-Za-z=_+\-/][0-9A-Za-z=_\'+\-/\.]*@[\w\-]+(\.[\w\-]+)*(\.[\w]{2,6}))~';
$email_link = '<ext-link ext-link-type="mailto" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="\1" xlink:title="\1"/>';
$xml = preg_replace($email_pattern, $email_link, $xml);

// Report completion on standard output.
echo "\nWROTE AND SAVED FILE";
function xml_replacement($data)
{
$data =
preg_replace('~(Table|Exhibit|Box) (\d+)~',
'<xref ref-type="table" rid="T\2">\1 \2</xref>',
$data);
// Images now?
if (isset($front['pub_abbr']) &&
preg_match('~\[\[(InlineFigure|Figure)\|\^\|\d+\|\^\|.+?\]\]~',
$data,
$matches)) {
if (trim($matches[1]) ==
'InlineFigure') $data =
preg_replace('~\[\[InlineFigure\|\^\|(\d+)\|\^\|(.+?)\]\]~',
" <inline-graphic xmlns:xlink=\"http://www.w3.org/1999/xlink\" xlink:href=\"/images/" .
substr($front['article_id'],
0,
3) .
"/" .
substr($front['article_id'],
-3) .
"/art-" .
$front['pub_abbr'] .
$front['article_id'] .
".equation$1.gif\"> <alt-text>Equation</alt-text>
</inline-graphic>
", $data);
else
$data =
preg_replace('~\[\[Figure\|\^\|(\d+)\|\^\|(.+?)\]\]~',
" <fig id=\"F\$1\">
<label>Figure \$1.</label>
<caption>