<?php
// The input path is the first command-line argument with its first 11
// characters stripped (presumably a "--filename=" style prefix; TODO confirm
// against however this script is invoked).
if (!isset($_SERVER['argv'][1]))
{
	echo "No input file argument given.\n";
	exit(1);
}
// substr() yields false on older PHP versions when the argument is too short.
$filename = (string) substr($_SERVER['argv'][1], 11);
$lines = ($filename !== '' && is_readable($filename)) ? file($filename) : false;
if ($lines === false)
{
	echo 'Unable to read input file: ' . $filename . "\n";
	exit(1);
}

// Are we inside a first level (<h3>) section?
$in_section = false;
// Are we inside a second level (<h4>) sub section?
$in_sub_section = false;
// Title of the current <h3> section.
$title = '';
// Title of the current <h4> sub section.
$sub_title = '';
// Array keys of the current section / sub section. Both start out as the empty
// string so that content found before the first <h3> lands in one untitled
// section instead of being keyed off undefined variables.
$clean_title = '';
$clean_sub_title = '';
// Counter used to build unique keys for sub sections that have no <h4> title.
$untitled_count = 0;
// Have we passed the opening <body> line yet?
$in_html_body = false;
// Front matter values ("Article ID:", "Publication:") found in the body.
$front = array();
// Parsed result: section key => array('title' => ..., 'sub_section' => array(...)).
$sections = array();

foreach ($lines as $line)
{
	// Clean spaces.
	$line = trim($line);

	// No blank lines. (Compared against '' because empty() would also drop a line holding just "0".)
	if ($line === '')
		continue;

	// Everything up to and including the <body> line is ignored.
	if (!$in_html_body)
	{
		if ($line == '<body>')
			$in_html_body = true;
		continue;
	}

	// We just end here. We don't parse the table and image stuff just yet.
	if (in_array($line, array('</body>', '</html>'), true))
		break;

	// Lets just do the head stuff first since it's usually on top.
	if (substr($line, 0, 11) == 'Article ID:')
	{
		$front['article_id'] = trim(substr($line, 11));
		continue;
	}
	elseif (substr($line, 0, 12) == 'Publication:')
	{
		$front['publication'] = trim(substr($line, 12));
		continue;
	}

	// A new first level heading shows up while we are still in a section? Leave the old one first.
	if ($in_section && substr($line, 0, 4) == '<h3>')
	{
		$in_section = false;
		$in_sub_section = false;
		$title = '';
		$sub_title = '';
	}

	// A section?
	if (!$in_section && preg_match('~^<h3>(.+?)</h3>$~', $line, $matches))
	{
		$in_section = true;
		$title = trim($matches[1]);

		// The array key is the lower-cased first 50 bytes of the title.
		$clean_title = strtolower(substr($title, 0, 50));

		// We start a new section in the array.
		// NOTE(review): a later <h3> that produces the same key replaces the earlier section.
		$sections[$clean_title] = array(
			'title' => $title,
			'sub_section' => array(),
		);

		// NEXT...
		continue;
	}

	// In a sub section and another one just pops up? We are no longer in the other one then. One at a time.
	if ($in_section && $in_sub_section && substr($line, 0, 4) == '<h4>')
		$in_sub_section = false;

	// Sub section.
	if ($in_section && !$in_sub_section && preg_match('~^<h4>(.+?)</h4>$~', $line, $matches))
	{
		$in_sub_section = true;
		$sub_title = trim($matches[1]);

		// Same keying rule as for sections.
		$clean_sub_title = strtolower(substr($sub_title, 0, 50));

		// We don't need any more data from this line so... NEXT!!!
		continue;
	}
	elseif ($in_section && !$in_sub_section)
	{
		// Content without an <h4> in front of it goes into an untitled sub section.
		// A counter keeps the keys unique (a time based hash repeats within the same
		// second) and the title is cleared so the previous <h4> title is not reused.
		$in_sub_section = true;
		$sub_title = '';
		$clean_sub_title = 'no_title_' . $untitled_count++;
	}

	// Content ahead of the first <h3>: give it an explicit untitled section.
	if (!isset($sections[$clean_title]))
		$sections[$clean_title] = array(
			'title' => '',
			'sub_section' => array(),
		);

	if (!isset($sections[$clean_title]['sub_section'][$clean_sub_title]))
		$sections[$clean_title]['sub_section'][$clean_sub_title] = array(
			'title' => $sub_title,
			'data' => array(),
		);

	// Now for the data.
	$sections[$clean_title]['sub_section'][$clean_sub_title]['data'][] = $line;
}
// Dump the parsed structure (debug output, kept as in the original).
print_r($sections);

// True between a stand-alone <p> line and its matching stand-alone </p> line.
// Set up front because the loop below reads it before any <p> may have been seen.
$open_p = false;

// NOTE(review): the opening <body> added here is never closed anywhere in this
// script -- confirm whether the output is meant to be a fragment.
$xml = '
<body>';
foreach ($sections as $section)
{
	// A section may lack a title when its content preceded the first heading.
	$section_title = isset($section['title']) ? $section['title'] : '';

	// First level? Then it's a page.
	$xml .= '
<sec sec-type="page">
<title>' . htmlspecialchars($section_title) . '</title>';

	// Ok we move on grass hoppa.
	foreach ($section['sub_section'] as $sub_section)
	{
		// Only a missing or empty title gets the self closing form ("0" is a real title).
		$sub_section_title = isset($sub_section['title']) ? (string) $sub_section['title'] : '';

		$xml .= '
<sec sec-type="content">
' . ($sub_section_title === '' ? '<title/>' : '<title>' . htmlspecialchars($sub_section_title) . '</title>');

		// Now the data, one trimmed source line at a time.
		foreach ($sub_section['data'] as $data)
		{
			// Bullet list?
			if ($data == '<ul>')
				$xml .= '
<list list-type="bullet">';
			// How about ordered?
			elseif ($data == '<ol>')
				$xml .= '
<list list-type="order">';
			// List items?
			elseif (substr($data, 0, 4) == '<li>')
			{
				// Drop the leading <li>, and the trailing </li> only when it is really
				// there, so an item without its closing tag keeps its last characters.
				$item = (string) substr($data, 4);
				if (substr($item, -5) == '</li>')
					$item = (string) substr($item, 0, -5);

				$xml .= '
<list-item><p>' . xml_replacement(htmlspecialchars($item)) . '</p></list-item>';
			}
			// Closing them?
			elseif ($data == '</ul>' || $data == '</ol>')
				$xml .= '
</list>';
			// A paragraph opened on a line of its own.
			elseif ($data == '<p>')
			{
				$xml .= '
<p>';
				$open_p = true;
			}
			// ...and closed on a line of its own.
			elseif ($data == '</p>')
			{
				$xml .= '
</p>';
				$open_p = false;
			}
			// Text inside a multi line paragraph.
			elseif ($open_p)
				$xml .= '
' . xml_replacement(htmlspecialchars($data));
			// A whole paragraph on a single line.
			elseif (preg_match('~^<p>(.+?)</p>$~', $data, $matches))
				$xml .= '
<p>' . xml_replacement(htmlspecialchars($matches[1])) . '</p>';
			// Anything else is passed through escaped.
			else
				$xml .= '
' . xml_replacement(htmlspecialchars($data));
		}

		$xml .= '
</sec>';
	}

	$xml .= '
</sec>';
}
// Show the generated XML, then save it next to the input file under the same
// base name with ".html" replaced by ".xml".
echo $xml;

$output_path = dirname($filename) . '/' . basename($filename, '.html') . '.xml';
$fp = fopen($output_path, 'w+');

// fopen() returns false on failure; writing to or closing that would only add more errors.
if ($fp === false)
	echo "\nCOULD NOT OPEN OUTPUT FILE: " . $output_path;
else
{
	// Only claim success when every byte made it out.
	if (fwrite($fp, $xml) === strlen($xml))
		echo "\nWROTE AND SAVED FILE";
	else
		echo "\nFAILED TO WRITE OUTPUT FILE: " . $output_path;

	fclose($fp);
}
/**
 * Turns every "Table N" mention (N being one or more digits) into an
 * <xref ref-type="table"> element whose rid is "T" followed by that number.
 *
 * The markup is inserted as-is, so any escaping has to be done by the caller
 * before the text is passed in.
 *
 * @param string $data The text to scan.
 * @return string The text with table mentions wrapped, or the text untouched when there are none.
 */
function xml_replacement($data)
{
	// No table mention at all? Hand the text straight back.
	if (!preg_match('~Table \d+~', $data))
		return $data;

	return preg_replace('~Table (\d+)~', '<xref ref-type="table" rid="T\1">Table \1</xref>', $data);
}
?>