News:

Please note these forums are mostly a testing ground for my SMF work and I don't really use them otherwise.

Main Menu

Paste-1207762485:v:use_geshi-1:v:type-php

Started by JayBachatero, Apr 09, 2008, 05:34 PM

Previous topic - Next topic

0 Members and 2 Guests are viewing this topic.

JayBachatero

<?php

// The input path is taken from the first command line argument, minus its first
// 11 characters (presumably a prefix such as "--filename=" -- TODO confirm with the caller).
if (!isset($_SERVER['argv'][1]) || strlen($_SERVER['argv'][1]) <= 11)
{
   echo "No input file given.\n";
   exit(1);
}

$filename = substr($_SERVER['argv'][1], 11);

// file() returns false when the file cannot be read; stop instead of parsing nothing.
$lines = file($filename);
if ($lines === false)
{
   echo 'Unable to read input file: ' . $filename . "\n";
   exit(1);
}

// Are we inside a first level (<h3>) section?
$in_section = false;
// Are we inside a second level (<h4>) sub section?
$in_sub_section = false;
// Title of the current section (<h3>).
$title = '';
// Title of the current sub section (<h4>).
$sub_title = '';
// Have we passed the opening <body> line yet?
$in_html_body = false;
// Have we seen the "References" heading?
$found_reference = false;
// Are we inside the <ol> that follows the "References" heading?
$in_list = false;

// Article meta data.  The publication date defaults to today until a "Pub Date:" line overrides it.
$front = array(
   'article_id' => 0,
   'publication' => 0,
   'column' => '',
   'volume' => '',
   'issue' => '',
   'pages' => '',
   'date' => array(
      'day' => date('d'),
      'month' => date('m'),
      'year' => date('Y'),
   )
);
// Back matter: the reference list and the acknowledgement text.
$back = array(
   'references' => array(),
   'ack' => '',
);

// Header labels whose (trimmed) value is copied straight into $front under the given key.
// No label is a prefix of another one, so the lookup order is irrelevant.
$front_labels = array(
   'Article ID:' => 'article_id',
   'Publication ID:' => 'publication',
   'Section/Column ID:' => 'column',
   'Volume:' => 'volume',
   'Issue:' => 'issue',
   'Pages:' => 'pages',
   'Surname:' => 'surname',
   'Disclosure:' => 'disclosure',
   'Bio:' => 'bio',
);

// Array keys of the section / sub section currently collecting data.  They are set up
// here so content that shows up before the first <h3> never reads an undefined variable.
$clean_title = '';
$clean_sub_title = '';

// Walk the input line by line and sort every line into $front, $back or $sections.
// $sections is keyed by the lower-cased first 50 characters of the <h3> title; each entry
// holds the title plus its sub sections, which in turn hold their raw data lines.
$sections = array();
foreach ($lines as $line)
{
   // Clean spaces.
   $line = trim($line);

   // No blank lines.  Compared against '' (not empty()) so a line holding just "0" is kept.
   if ($line === '')
      continue;

   // Everything up to and including the opening <body> line is skipped.
   if (!$in_html_body)
   {
      if ($line === '<body>')
         $in_html_body = true;

      continue;
   }

   // We just end here.  We don't parse the table and image stuff just yet.
   if ($line === '</body>' || $line === '</html>')
      break;

   // Bare comment delimiters carry no content.
   if ($line === '<!--' || $line === '-->')
      continue;

   // Lets just do the head stuff first since it's usually on top.
   foreach ($front_labels as $label => $field)
   {
      if (strncmp($line, $label, strlen($label)) === 0)
      {
         $front[$field] = trim(substr($line, strlen($label)));
         continue 2;
      }
   }

   if (strncmp($line, 'Pub Date:', 9) === 0)
   {
      // Read as month/day/year.  Pad to three parts so a short date falls back to
      // today's values below instead of touching undefined offsets.
      list ($month, $day, $year) = array_pad(explode('/', trim(substr($line, 9))), 3, '');
      $front['date']['month'] = empty($month) ? date('m') : $month;
      $front['date']['day'] = empty($day) ? date('d') : $day;
      $front['date']['year'] = empty($year) ? date('Y') : $year;
      continue;
   }

   if (strncmp($line, 'Ack:', 4) === 0)
   {
      $back['ack'] = trim(substr($line, 4));
      continue;
   }

   // References here.
   if ($line === '<h3>References</h3>')
   {
      $found_reference = true;
      continue;
   }

   // A new first level heading while we are already in a section?  Leave the old one.
   if ($in_section && !$found_reference && substr($line, 0, 4) == '<h3>')
   {
      $in_section = false;
      $in_sub_section = false;
      $title = '';
      $sub_title = '';
   }

   // A section?
   if (!$in_section && !$found_reference && preg_match('~^<h3>(.+?)</h3>$~', $line, $matches))
   {
      $in_section = true;
      $title = trim($matches[1]);

      // The array key is the lower-cased first 50 characters of the title.
      $clean_title = strtolower(substr($title, 0, 50));

      // We start a new section in the array.
      $sections[$clean_title] = array(
         'title' => $title,
         'sub_section' => array(),
      );

      // NEXT...
      continue;
   }

   // Reference entries are only collected from the <ol> that follows the References heading.
   if ($found_reference && $line === '<ol>')
   {
      $in_list = true;
      continue;
   }

   if ($in_list && $found_reference && substr($line, 0, 4) == '<li>')
   {
      // Strip the leading "<li>" and the trailing five characters (the closing "</li>").
      $back['references'][] = htmlspecialchars(substr($line, 4, -5));
      continue;
   }

   if ($in_list && $found_reference && $line === '</ol>')
   {
      $found_reference = false;
      $in_list = false;
      continue;
   }

   // In a sub section and another one just pops up?  One sub section at a time.
   if ($in_section && $in_sub_section && substr($line, 0, 4) == '<h4>')
      $in_sub_section = false;

   // Sub section.
   if ($in_section && !$in_sub_section && preg_match('~^<h4>(.+?)</h4>$~', $line, $matches))
   {
      $in_sub_section = true;
      $sub_title = trim($matches[1]);

      // Same key scheme as for sections.
      $clean_sub_title = strtolower(substr($sub_title, 0, 50));

      // We don't need any more data from this line so... NEXT!!!
      continue;
   }
   elseif ($in_section && !$in_sub_section)
   {
      // Content without an <h4> heading goes into an untitled sub section.  Clear the
      // title so a previous sub section's heading cannot leak into this one.
      $in_sub_section = true;
      $sub_title = '';
      $clean_sub_title = 'no_title_' . substr(md5(time()), 0, 5);
   }

   // Content ahead of the first <h3> lands in a section with an empty title.
   if (!isset($sections[$clean_title]))
      $sections[$clean_title] = array(
         'title' => '',
         'sub_section' => array(),
      );

   if (!isset($sections[$clean_title]['sub_section'][$clean_sub_title]))
      $sections[$clean_title]['sub_section'][$clean_sub_title] = array(
         'title' => $sub_title,
         'data' => array(),
      );

   // Now for the data.
   $sections[$clean_title]['sub_section'][$clean_sub_title]['data'][] = $line;
}

// Build the <body> element: one "page" <sec> per section, one "content" <sec> per sub section.
$body = '

   <body>';

// True while we are between a stand-alone <p> line and its matching </p> line.
// It has to start out false: the first data line is frequently not a <p>.
$open_p = false;

foreach ($sections as $section)
{
   // First level?  Then it's a page.
   $body .= '
      <sec sec-type="page">
         <title>' . htmlspecialchars(isset($section['title']) ? $section['title'] : '') . '</title>';

   // On to the second level.
   foreach ($section['sub_section'] as $sub_section)
   {
      $body .= '
         <sec sec-type="content">
            ' . (empty($sub_section['title']) ? '<title/>' : '<title>' . htmlspecialchars($sub_section['title']) . '</title>');

      // Now the data.  Each entry is one trimmed line of the source HTML.
      foreach ($sub_section['data'] as $data)
      {
         // Bullet list?
         if ($data == '<ul>')
            $body .= '
            <list list-type="bullet">';
         // How about ordered?
         elseif ($data == '<ol>')
            $body .= '
            <list list-type="order">';
         // List items?  Drop the leading "<li>" and the trailing five characters ("</li>").
         elseif (substr($data, 0, 4) == '<li>')
            $body .= '
               <list-item><p>' . xml_replacement(htmlspecialchars(substr($data, 4, -5))) . '</p></list-item>';
         // Closing them?
         elseif ($data == '</ul>' || $data == '</ol>')
            $body .= '
            </list>';
         // A paragraph that is opened on a line of its own...
         elseif ($data == '<p>')
         {
            $body .= '
            <p>';
            $open_p = true;
         }
         // ...and closed on a line of its own.
         elseif ($data == '</p>')
         {
            $body .= '
            </p>';
            $open_p = false;
         }
         // Text inside such a multi-line paragraph.
         elseif ($open_p)
            $body .= '
               ' . xml_replacement(htmlspecialchars($data));
         // A complete paragraph on a single line.
         elseif (preg_match('~^<p>(.+?)</p>$~', $data, $matches))
            $body .= '
            <p>' . xml_replacement(htmlspecialchars($matches[1])) . '</p>';
         // Anything else is passed through escaped.
         else
            $body .= '
            ' . xml_replacement(htmlspecialchars($data));
      }

      $body .= '
         </sec>';
   }

   $body .= '
      </sec>';
}

$body .= '
   </body>';

// Generate the front section
$front_xml = generate_front($front);
// References
$back_xml = generate_back($back);

// Put the three parts together into the final document.
$xml = '<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article SYSTEM "/content/journal-publishing-dtd-1.1/journalpublishing.dtd">
<article>' . $front_xml . $body . $back_xml . '
</article>';

echo $xml;

// The result is saved next to the input file, with a trailing ".html" swapped for ".xml".
$target = dirname($filename) . '/' . basename($filename, '.html') . '.xml';
$fp = fopen($target, 'w+');

// fopen() returns false on failure; never hand that to fwrite()/fclose().
if ($fp === false)
   echo "\nCOULD NOT OPEN OUTPUT FILE: " . $target;
else
{
   if (fwrite($fp, $xml))
      echo "\nWROTE AND SAVED FILE";
   fclose($fp);
}

/**
 * Wraps every "Table <number>" mention in a table cross reference.
 *
 * "Table 3" becomes <xref ref-type="table" rid="T3">Table 3</xref>; text without
 * such a mention is handed back untouched.
 *
 * @param string $data Text to scan.
 * @return string The text with table mentions turned into <xref> elements.
 */
function xml_replacement($data)
{
   // No table mention at all?  Then there is nothing to rewrite.
   if (!preg_match('~Table \d+~', $data))
      return $data;

   return preg_replace('~Table (\d+)~', '<xref ref-type="table" rid="T\1">Table \1</xref>', $data);
}

/**
 * Builds the <front> element (journal and article meta data).
 *
 * Keys read from $front, all optional: article_id, column, surname, bio, disclosure,
 * date (array with day, month and year), volume, issue and pages.  Numeric fields are
 * cast to int; every free text field is escaped with htmlspecialchars() so characters
 * such as & or < cannot break the generated XML.
 *
 * @param array $front Article meta data.
 * @return string The <front> element, starting with a line break.
 */
function generate_front($front)
{
   $front_xml = '
   <front>
      <journal-meta>
         <journal-id journal-id-type="publication">768</journal-id>
         <issn/>
         <publisher>
            <publisher-name/>
         </publisher>
      </journal-meta>

      <article-meta>
         <article-id>' . (isset($front['article_id']) ? (int) $front['article_id'] : '') . '</article-id>

         <article-categories>
            <subj-group>
               <!-- In the <subject> field, enter ONLY ONE of the following, spelled EXACTLY as follows: recResource; abstract; clinicalReference; faq; interactiveCase; journalArticle; journalScan; news; profAskTheExpert -->
               <subject>journalArticle</subject>
            </subj-group>
            <series-title></series-title>' . (isset($front['column']) ? '
            <series-title series-type="column">' . htmlspecialchars($front['column']) . '</series-title>' : '') . '
         </article-categories>

         <title-group>
            <article-title></article-title>
            <subtitle></subtitle>
         </title-group>

         <contrib-group>
            <contrib author-id="0" contrib-type="author">
               <name>
                  <!-- Author names go within the <surname> tags, separated by semicolons -->
                  <surname>' . (isset($front['surname']) ? htmlspecialchars($front['surname']) : '') . '</surname>
               </name>
               <bio>
                  <!-- Author Bio text goes within the <p>tags -->
                  <p>' . (isset($front['bio']) ? htmlspecialchars($front['bio']) : '') . '</p>
               </bio>
               <role>Additional Author</role>
               <author-comment>
                  <!-- Author Disclosure text goes within the <p>tags -->
                  <p>' . (isset($front['disclosure']) ? htmlspecialchars($front['disclosure']) : '') . '</p>
               </author-comment>
            </contrib>
         </contrib-group>

         <pub-date>
            <day>' . (isset($front['date']['day']) ? sprintf("%02d", (int) $front['date']['day']) : '') . '</day>
            <month>' . (isset($front['date']['month']) ? sprintf("%02d", (int) $front['date']['month']) : '') . '</month>
            <year>' . (isset($front['date']['year']) ? (int) $front['date']['year'] : '') . '</year>
         </pub-date>

         <volume>' . (!empty($front['volume']) ? (int) $front['volume'] : '') . '</volume>
         <issue>' . (!empty($front['issue']) ? (int) $front['issue'] : '') . '</issue>
         <fpage>' . (isset($front['pages']) ? htmlspecialchars($front['pages']) : '') . '</fpage>

         <copyright-year/>

         <abstract>
            <title>Abstract</title>
            <p></p>
         </abstract>
      </article-meta>
   </front>';

   return $front_xml;
}

/**
 * Renders the reference list as a <ref-list> holding one ordered <list>.
 *
 * Entries are written out exactly as given; no escaping happens here.
 *
 * @param array $references Reference strings, one per list item.
 * @return string The <ref-list> element, or '' when there are no references.
 */
function generate_references($references)
{
   // Nothing to list, nothing to emit.
   if (empty($references))
      return '';

   // Collect the items first, then drop them into the surrounding frame.
   $items = '';
   foreach ($references as $entry)
      $items .= '
            <list-item><p>' . $entry . '</p></list-item>';

   return '
      <ref-list>
         <title>References</title>
         <list list-type="order">' . $items . '
         </list>
      </ref-list>';
}

/**
 * Builds the <back> element: the reference list followed by the acknowledgement.
 *
 * @param array $back May hold 'references' (array of reference strings, passed on to
 *                    generate_references()) and 'ack' (plain text, escaped here).
 * @return string The complete <back> element, starting with a line break.
 */
function generate_back($back)
{
   $xml = '
   <back>';

   // References
   if (!empty($back['references']))
      $xml .= generate_references($back['references']);

   // Ack
   if (!empty($back['ack']))
      $xml .= '
      <ack>
         <title>Acknowledgements</title>
         <p>' . htmlspecialchars($back['ack']) . '</p>
      </ack>';

   // Append the closing tag.  Assigning here would throw away everything built above.
   $xml .= '
   </back>';

   return $xml;
}

?>