<?php
// The input path is the first command line argument minus its first 11 characters.
// NOTE(review): presumably the argument looks like "--filename=<path>" (an 11 character prefix) - confirm against the caller.
if (!isset($_SERVER['argv'][1]))
{
    file_put_contents('php://stderr', "No input file argument given\n");
    exit(1);
}
$filename = substr($_SERVER['argv'][1], 11);
// substr() gives false (older PHP) or '' when the argument is too short to contain a path.
// Stop here: with an empty path the output file name built later would point at "/.xml".
if ($filename === false || $filename === '')
{
    file_put_contents('php://stderr', "Input file argument does not contain a path\n");
    exit(1);
}
$lines = file($filename);
// file() returns false when the file cannot be read; there is nothing to convert then.
if ($lines === false)
{
    file_put_contents('php://stderr', 'Could not read ' . $filename . "\n");
    exit(1);
}
// Are we inside a first level (<h3>) section?
$in_section = false;
// Are we inside a second level (<h4>) sub section?
$in_sub_section = false;
// Title of the current <h3> section.
$title = '';
// Title of the current <h4> sub section.
$sub_title = '';
// Have we passed the opening <body> line yet?
$in_html_body = false;
// Set once the "<h3>References</h3>" line was seen, cleared when its list is closed.
$found_reference = false;
// Are we inside the <ol> of the references?
$in_list = false;
// Article meta data; the publication date defaults to today.
$front = array(
    'article_id' => 0,
    'publication' => 0,
    'column' => '',
    'volume' => '',
    'issue' => '',
    'pages' => '',
    'date' => array(
        'day' => date('d'),
        'month' => date('m'),
        'year' => date('Y'),
    )
);
// Back matter: the reference list and the acknowledgements.
$back = array(
    'references' => array(),
    'ack' => '',
);
// Parsed content, keyed by the lower cased (and shortened) section title.
$sections = array();
// Header lines of the form "<label><value>" whose value is copied as is into $front.
$front_fields = array(
    'Article ID:' => 'article_id',
    'Publication ID:' => 'publication',
    'Section/Column ID:' => 'column',
    'Volume:' => 'volume',
    'Issue:' => 'issue',
    'Pages:' => 'pages',
    'Surname:' => 'surname',
    'Disclosure:' => 'disclosure',
    'Bio:' => 'bio',
);
// Keys of the section and sub section currently receiving content.
// They start out empty so content found before the first <h3> ends up in an untitled section instead of using undefined variables.
$clean_title = '';
$clean_sub_title = '';
// Number of untitled sub sections created so far; used to give each one a unique key.
$untitled_count = 0;

// Walk the HTML line by line and sort every line into meta data, references or section content.
foreach ($lines as $line)
{
    // Clean spaces.
    $line = trim($line);

    // No blank lines. Compare against '' so a line holding just "0" is not thrown away.
    if ($line === '')
        continue;

    // Everything up to and including the <body> line is ignored.
    if (!$in_html_body)
    {
        if ($line == '<body>')
            $in_html_body = true;
        continue;
    }

    // We just end here. We don't parse the table and image stuff just yet.
    if (in_array($line, array('</body>', '</html>'), true))
        break;

    // Comment markers standing on their own line are skipped.
    if (in_array($line, array('<!--', '-->'), true))
        continue;

    // Lets just do the head stuff first since it's usually on top.
    $is_front_field = false;
    foreach ($front_fields as $label => $field)
    {
        if (strncmp($line, $label, strlen($label)) === 0)
        {
            $front[$field] = trim((string) substr($line, strlen($label)));
            $is_front_field = true;
            break;
        }
    }
    if ($is_front_field)
        continue;

    if (strncmp($line, 'Pub Date:', 9) === 0)
    {
        // The date is split as month/day/year. Pad it so an incomplete date falls back to today's values without notices.
        list ($month, $day, $year) = array_pad(explode('/', trim((string) substr($line, 9))), 3, '');
        $front['date']['month'] = empty($month) ? date('m') : $month;
        $front['date']['day'] = empty($day) ? date('d') : $day;
        $front['date']['year'] = empty($year) ? date('Y') : $year;
        continue;
    }

    if (strncmp($line, 'Ack:', 4) === 0)
    {
        $back['ack'] = trim((string) substr($line, 4));
        continue;
    }

    // References here.
    if ($line == '<h3>References</h3>')
    {
        $found_reference = true;
        continue;
    }

    // A new first level section while we are still in one? Leave the old one first.
    if ($in_section && !$found_reference && substr($line, 0, 4) == '<h3>')
    {
        $in_section = false;
        $in_sub_section = false;
        $title = '';
        $sub_title = '';
    }

    // A section?
    if (!$in_section && !$found_reference && preg_match('~^<h3>(.+?)</h3>$~', $line, $matches))
    {
        $in_section = true;
        $title = trim($matches[1]);

        // The array key is the lower cased title, cut to 50 characters.
        $clean_title = strtolower(substr($title, 0, 50));

        // We start a new section in the array.
        $sections[$clean_title] = array(
            'title' => $title,
            'sub_section' => array(),
        );

        // NEXT...
        continue;
    }

    // The reference list: only looked at once the references heading was found.
    if ($found_reference && $line == '<ol>')
    {
        $in_list = true;
        continue;
    }
    if ($in_list && $found_reference && substr($line, 0, 4) == '<li>')
    {
        // Drop the opening tag and, only when it is really there, the closing tag.
        $reference = (string) substr($line, 4);
        if (substr($reference, -5) == '</li>')
            $reference = (string) substr($reference, 0, -5);

        $back['references'][] = htmlspecialchars($reference);
        continue;
    }
    if ($in_list && $found_reference && $line == '</ol>')
    {
        $found_reference = false;
        $in_list = false;
        continue;
    }

    // In a sub section and another one just pops up? We are no longer in the other one then. One at a time.
    if ($in_section && $in_sub_section && substr($line, 0, 4) == '<h4>')
        $in_sub_section = false;

    // Sub section.
    if ($in_section && !$in_sub_section && preg_match('~^<h4>(.+?)</h4>$~', $line, $matches))
    {
        $in_sub_section = true;
        $sub_title = trim($matches[1]);

        // Same key rule as for the section titles.
        $clean_sub_title = strtolower(substr($sub_title, 0, 50));

        // We don't need any more data from this line so... NEXT!!!
        continue;
    }
    elseif ($in_section && !$in_sub_section)
    {
        // Content without a sub heading goes into an untitled sub section.
        // A counter (instead of a time based hash) keeps two of these from sharing a key,
        // and the title is cleared so it does not inherit the previous sub section's title.
        $in_sub_section = true;
        $sub_title = '';
        $clean_sub_title = 'no_title_' . (++$untitled_count);
    }

    // Make sure the containers exist before we add to them.
    if (!isset($sections[$clean_title]))
        $sections[$clean_title] = array(
            'title' => '',
            'sub_section' => array(),
        );
    if (!isset($sections[$clean_title]['sub_section'][$clean_sub_title]))
        $sections[$clean_title]['sub_section'][$clean_sub_title] = array(
            'title' => $sub_title,
            'data' => array(),
        );

    // Now for the data.
    $sections[$clean_title]['sub_section'][$clean_sub_title]['data'][] = $line;
}
// Turn the parsed sections into the <body> part of the article XML.
$body = '
<body>';
// True while we are between a "<p>" line and its "</p>" line.
// It has to start out false: the first content line checks it before any "<p>" line was seen.
$open_p = false;
foreach ($sections as $section)
{
    // A section created implicitly (content outside of any heading) may come without a title.
    $section_title = isset($section['title']) ? $section['title'] : '';

    // First level? Then it's a page.
    $body .= '
<sec sec-type="page">
<title>' . htmlspecialchars($section_title) . '</title>';

    // Every sub section becomes a content block inside the page.
    foreach ($section['sub_section'] as $sub_section)
    {
        $body .= '
<sec sec-type="content">
' . (empty($sub_section['title']) ? '<title/>' : '<title>' . htmlspecialchars($sub_section['title']) . '</title>');

        // Now the data, one source line at a time.
        foreach ($sub_section['data'] as $data)
        {
            // Bullet list?
            if ($data == '<ul>')
                $body .= '
<list list-type="bullet">';
            // How about ordered?
            elseif ($data == '<ol>')
                $body .= '
<list list-type="order">';
            // List items?
            elseif (substr($data, 0, 4) == '<li>')
            {
                // Drop the opening tag and, only when it is really there, the closing tag.
                $item = (string) substr($data, 4);
                if (substr($item, -5) == '</li>')
                    $item = (string) substr($item, 0, -5);

                $body .= '
<list-item><p>' . xml_replacement(htmlspecialchars($item)) . '</p></list-item>';
            }
            // Closing them?
            elseif ($data == '</ul>' || $data == '</ol>')
                $body .= '
</list>';
            // A paragraph spread over several lines starts...
            elseif ($data == '<p>')
            {
                $body .= '
<p>';
                $open_p = true;
            }
            // ...and ends.
            elseif ($data == '</p>')
            {
                $body .= '
</p>';
                $open_p = false;
            }
            // Text inside of an open paragraph.
            elseif ($open_p)
                $body .= '
' . xml_replacement(htmlspecialchars($data));
            // A complete paragraph on a single line.
            elseif (preg_match('~^<p>(.+?)</p>$~', $data, $matches))
                $body .= '
<p>' . xml_replacement(htmlspecialchars($matches[1])) . '</p>';
            // Anything else is escaped and passed through.
            else
                $body .= '
' . xml_replacement(htmlspecialchars($data));
        }

        $body .= '
</sec>';
    }

    $body .= '
</sec>';
}
$body .= '
</body>';
// Generate the front section
$front_xml = generate_front($front);
// References and acknowledgements
$back_xml = generate_back($back);
// Put the whole document together.
$xml = '<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article SYSTEM "/content/journal-publishing-dtd-1.1/journalpublishing.dtd">
<article>' . $front_xml . $body . $back_xml . '
</article>';
echo $xml;

// The XML is saved next to the input file, with ".html" swapped for ".xml".
$xml_path = dirname($filename) . '/' . basename($filename, '.html') . '.xml';
$fp = fopen($xml_path, 'w+');
// fopen() returns false on failure; writing to or closing that would only raise more errors.
if ($fp === false)
{
    file_put_contents('php://stderr', "\nCould not open " . $xml_path . " for writing\n");
    exit(1);
}
$written = fwrite($fp, $xml);
fclose($fp);
// Only report success when the complete document made it to disk.
if ($written === strlen($xml))
    echo "\nWROTE AND SAVED FILE";
else
{
    file_put_contents('php://stderr', "\nCould not write " . $xml_path . "\n");
    exit(1);
}
/**
 * Wraps every "Table <number>" mention in a table cross reference.
 *
 * "Table 3" becomes <xref ref-type="table" rid="T3">Table 3</xref>.
 * Text without such a mention is handed back untouched.
 *
 * @param string $data The (already escaped) text to scan.
 * @return string The text with the cross references added.
 */
function xml_replacement($data)
{
    // Nothing that looks like a table mention? Then there is nothing to do.
    if (!preg_match('~Table \d+~', $data))
        return $data;

    $with_xrefs = preg_replace('~Table (\d+)~', '<xref ref-type="table" rid="T\1">Table \1</xref>', $data);

    return $with_xrefs;
}
/**
 * Builds the <front> element (journal and article meta data) of the article XML.
 *
 * All free text values are escaped with htmlspecialchars() so that characters such as
 * "&" or "<" in a column name, surname or page range cannot break the XML.
 * NOTE(review): the journal id is fixed to 768 and $front['publication'] is not used here - confirm this is intended.
 *
 * @param array $front Meta data. Keys read: article_id, column, surname, bio, disclosure,
 *                     date (day, month, year), volume, issue, pages. Missing keys give empty elements.
 * @return string The <front> XML fragment, starting with a line break.
 */
function generate_front($front)
{
    // Numeric values are cast to int, the day and month are zero padded to two digits.
    $article_id = isset($front['article_id']) ? (int) $front['article_id'] : '';
    $day = isset($front['date']['day']) ? sprintf("%02d", (int) $front['date']['day']) : '';
    $month = isset($front['date']['month']) ? sprintf("%02d", (int) $front['date']['month']) : '';
    $year = isset($front['date']['year']) ? (int) $front['date']['year'] : '';
    $volume = !empty($front['volume']) ? (int) $front['volume'] : '';
    $issue = !empty($front['issue']) ? (int) $front['issue'] : '';

    // Free text values, escaped for use as element content.
    $surname = isset($front['surname']) ? htmlspecialchars((string) $front['surname']) : '';
    $bio = isset($front['bio']) ? htmlspecialchars((string) $front['bio']) : '';
    $disclosure = isset($front['disclosure']) ? htmlspecialchars((string) $front['disclosure']) : '';
    $pages = isset($front['pages']) ? htmlspecialchars((string) $front['pages']) : '';

    // The column gets its own <series-title>, but only when the key is there at all.
    $column_xml = isset($front['column']) ? '
<series-title series-type="column">' . htmlspecialchars((string) $front['column']) . '</series-title>' : '';

    $front_xml = '
<front>
<journal-meta>
<journal-id journal-id-type="publication">768</journal-id>
<issn/>
<publisher>
<publisher-name/>
</publisher>
</journal-meta>
<article-meta>
<article-id>' . $article_id . '</article-id>
<article-categories>
<subj-group>
<!-- In the <subject> field, enter ONLY ONE of the following, spelled EXACTLY as follows: recResource; abstract; clinicalReference; faq; interactiveCase; journalArticle; journalScan; news; profAskTheExpert -->
<subject>journalArticle</subject>
</subj-group>
<series-title></series-title>' . $column_xml . '
</article-categories>
<title-group>
<article-title></article-title>
<subtitle></subtitle>
</title-group>
<contrib-group>
<contrib author-id="0" contrib-type="author">
<name>
<!-- Author names go within the <surname> tags, separated by semicolons -->
<surname>' . $surname . '</surname>
</name>
<bio>
<!-- Author Bio text goes within the <p>tags -->
<p>' . $bio . '</p>
</bio>
<role>Additional Author</role>
<author-comment>
<!-- Author Disclosure text goes within the <p>tags -->
<p>' . $disclosure . '</p>
</author-comment>
</contrib>
</contrib-group>
<pub-date>
<day>' . $day . '</day>
<month>' . $month . '</month>
<year>' . $year . '</year>
</pub-date>
<volume>' . $volume . '</volume>
<issue>' . $issue . '</issue>
<fpage>' . $pages . '</fpage>
<copyright-year/>
<abstract>
<title>Abstract</title>
<p></p>
</abstract>
</article-meta>
</front>';

    return $front_xml;
}
/**
 * Renders the reference list as a <ref-list> element.
 *
 * The references are inserted as they are; no escaping happens in here.
 *
 * @param array $references The reference texts, one entry per reference.
 * @return string The <ref-list> XML fragment, or an empty string when there are no references.
 */
function generate_references($references)
{
    // No references, no list.
    if (empty($references))
        return '';

    // Collect the pieces first and glue them together at the end.
    $parts = array('
<ref-list>
<title>References</title>
<list list-type="order">');

    foreach ($references as $entry)
        $parts[] = '
<list-item><p>' . $entry . '</p></list-item>';

    $parts[] = '
</list>
</ref-list>';

    return implode('', $parts);
}
/**
 * Builds the <back> element holding the reference list and the acknowledgements.
 *
 * @param array $back Back matter. Keys read: references (array of reference texts, passed on to
 *                    generate_references()) and ack (acknowledgement text, escaped in here).
 * @return string The <back> XML fragment, starting with a line break.
 */
function generate_back($back)
{
    $xml = '
<back>';

    // References
    if (!empty($back['references']))
        $xml .= generate_references($back['references']);

    // Ack
    if (!empty($back['ack']))
        $xml .= '
<ack>
<title>Acknowledgements</title>
<p>' . htmlspecialchars($back['ack']) . '</p>
</ack>';

    // Append the closing tag. Assigning it instead would throw away everything built above.
    $xml .= '
</back>';

    return $xml;
}
?>