News:

Please note these forums are mostly a testing ground for my SMF work and I don't really use them otherwise.

Main Menu

Paste-1219954535:v:use_geshi-1:v:type-php

Started by JayBachatero, Aug 28, 2008, 08:15 PM

Previous topic - Next topic

0 Members and 4 Guests are viewing this topic.

JayBachatero

<?php

// The input path is taken from the first command line argument with its first
// 11 characters removed (presumably an option prefix such as "--filename=" --
// TODO confirm against whatever invokes this script).
if (!isset($_SERVER['argv'][1]) || strlen($_SERVER['argv'][1]) <= 11)
{
   file_put_contents('php://stderr', "Missing input file argument (expected an 11 character prefix followed by the path).\n");
   exit(1);
}

$filename = substr($_SERVER['argv'][1], 11);

// Stop here if the file cannot be read; otherwise the parsing loop would run on "false".
$lines = is_readable($filename) ? file($filename) : false;
if ($lines === false)
{
   file_put_contents('php://stderr', 'Unable to read input file: ' . $filename . "\n");
   exit(1);
}

// Are we inside a first level (<h3>) section?
$in_section = false;
// Are we inside a second level (<h4>) sub section?
$in_sub_section = false;
// Title of the current <h3> section.
$title = '';
// Title of the current <h4> sub section.
$sub_title = '';
// Have we passed the opening <body> line yet?
$in_html_body = false;
// Have we seen the "References" heading, and are we inside its list?
$found_reference = false;
$in_list = false;
// Not read anywhere else in this file; kept so existing includes do not break.
$tables_count = 0;

// Article metadata that ends up in <front>.  The date defaults to today.
$front = array(
   'article_id' => 0,
   'publication' => 0,
   'column' => '',
   'volume' => '',
   'issue' => '',
   'pages' => '',
   'date' => array(
      'day' => date('d'),
      'month' => date('m'),
      'year' => date('Y'),
   ),
   'pub_abbr' => '',
);
// Everything that ends up in <back>.
$back = array(
   'references' => array(),
   'ack' => '',
   'financial' => '',
   'conflict' => '',
   'abbr' => '',
   'reprint' => '',
   'sidebar' => array(),
);

$sections = array();

// Loop state that is read further down before every path has assigned it;
// start from known values so nothing is ever read undefined.
$clean_title = '';
$clean_sub_title = '';
$section_type = '';
// Read by generate_references() through "global"; set when the reference list opens.
$reference_ordered = false;
// Running number that keeps the keys of untitled sub sections distinct.
$untitled_count = 0;

// "Label:" line prefixes that are copied straight into a field:
// label => array(name of the target array, key inside it).
$meta_fields = array(
   'Article ID:' => array('front', 'article_id'),
   'Publication ID:' => array('front', 'publication'),
   'Section/Column ID:' => array('front', 'column'),
   'Volume:' => array('front', 'volume'),
   'Issue:' => array('front', 'issue'),
   'Pages:' => array('front', 'pages'),
   'Surname:' => array('front', 'surname'),
   'Disclosure:' => array('front', 'disclosure'),
   'Bio:' => array('front', 'bio'),
   'Pub Abbr:' => array('front', 'pub_abbr'),
   'Ack:' => array('back', 'ack'),
   'Financial:' => array('back', 'financial'),
   'Funding:' => array('back', 'financial'),
   'Conflict:' => array('back', 'conflict'),
   'Abbr:' => array('back', 'abbr'),
   'Reprint:' => array('back', 'reprint'),
);

// Walk the input one line at a time, collecting metadata into $front / $back
// and body content into $sections[section key]['sub_section'][sub key]['data'].
foreach ($lines as $line)
{
   // Clean spaces.
   $line = trim($line);

   // No blank lines.  Compared with '' on purpose: empty() would also throw
   // away a line that consists of just "0".
   if ($line === '')
      continue;

   // Skip the head matter: nothing counts until the <body> line has been seen.
   if (!$in_html_body)
   {
      if ($line == '<body>')
         $in_html_body = true;
      continue;
   }

   // We just end here.  We don't parse the table and image stuff just yet.
   if (in_array($line, array('</body>', '</html>')))
      break;

   // Comment delimiters on their own line carry no content.
   if (in_array($line, array('<!--', '-->')))
      continue;

   // Lets just do the head stuff first since it's usually on top.
   $is_meta = false;
   foreach ($meta_fields as $label => $target)
   {
      if (strncmp($line, $label, strlen($label)) != 0)
         continue;

      $value = trim(substr($line, strlen($label)));
      if ($target[0] == 'front')
         $front[$target[1]] = $value;
      else
         $back[$target[1]] = $value;

      $is_meta = true;
      break;
   }
   if ($is_meta)
      continue;

   // Publication date, written month/day/year.  Missing parts fall back to today.
   if (substr($line, 0, 9) == 'Pub Date:')
   {
      // Pad to three parts so a short date cannot leave one of them undefined.
      list ($month, $day, $year) = array_pad(explode('/', trim(substr($line, 9))), 3, '');
      $front['date']['month'] = empty($month) ? date('m') : $month;
      $front['date']['day'] = empty($day) ? date('d') : $day;
      $front['date']['year'] = empty($year) ? date('Y') : $year;
      continue;
   }

   // Only the exact text "true" switches the images flag on.
   if (substr($line, 0, 7) == 'Images:')
   {
      $front['has_images'] = trim(substr($line, 7)) == 'true';
      continue;
   }

   // References here.
   if ($line == '<h3>References</h3>')
   {
      $found_reference = true;
      continue;
   }

   // In a first level section and another one starts?  Close the current one.
   if ($in_section && !$found_reference && substr($line, 0, 3) == '<h3')
   {
      $in_section = false;
      $in_sub_section = false;
      $title = '';
      $sub_title = '';
      $section_type = '';
   }

   // A section?
   if (!$in_section && !$found_reference && preg_match('~^<h3(?:\s*id="([a-zA-Z0-9_]+)")?>(.+?)</h3>$~', $line, $matches))
   {
      $in_section = true;

      // Group 1 is the optional id (an empty string when absent), group 2 the heading text.
      $title = trim($matches[2]);

      // Key for this section.  Titles can repeat (a title of "." stands for
      // "no title"), so add a suffix rather than overwrite an earlier section.
      $clean_title = strtolower(substr($title, 0, 50));
      $base_key = $clean_title;
      for ($suffix = 2; isset($sections[$clean_title]); $suffix++)
         $clean_title = $base_key . '_' . $suffix;

      // We start a new section in the array.
      $sections[$clean_title] = array(
         'title' => $title == '.' ? '' : $title,
         'sub_section' => array(),
         'type' => trim($matches[1]),
      );

      // NEXT...
      continue;
   }

   // First lets check if we even doing references yet then check for list.
   if ($found_reference && ($line == '<ol>' || $line == '<ul>'))
   {
      $in_list = true;
      $reference_ordered = $line == '<ol>';
      continue;
   }

   // One reference per <li> line.
   if ($in_list && $found_reference && substr($line, 0, 4) == '<li>')
   {
      // Drop the closing tag only when it is really there, so an item whose
      // </li> sits on a later line does not lose its last five characters.
      $item = (string) substr($line, 4);
      if (substr($item, -5) == '</li>')
         $item = (string) substr($item, 0, -5);

      $back['references'][] = htmlspecialchars_uni($item);
      continue;
   }

   // End of the reference list; ordinary content may follow again.
   if ($in_list && $found_reference && ($line == '</ol>' || $line == '</ul>'))
   {
      $found_reference = false;
      $in_list = false;
      continue;
   }

   // In a sub section and another one just pops up? We no longer in the other one then.  One section at a time.
   if ($in_section && $in_sub_section && substr($line, 0, 3) == '<h4')
      $in_sub_section = false;

   // Sub section.
   if ($in_section && !$in_sub_section && preg_match('~^<h4(?:\s*id="([a-zA-Z0-9_]+)")?>(.+?)</h4>$~', $line, $matches))
   {
      $in_sub_section = true;
      $sub_title = trim($matches[2]);
      // The optional id doubles as the sub section type (empty when absent).
      $section_type = trim($matches[1]);

      // Key for this sub section, made unique inside the current section so
      // two headings with the same text do not end up sharing one entry.
      $clean_sub_title = strtolower(substr($sub_title, 0, 50));
      $base_key = $clean_sub_title;
      for ($suffix = 2; isset($sections[$clean_title]['sub_section'][$clean_sub_title]); $suffix++)
         $clean_sub_title = $base_key . '_' . $suffix;

      // We don't need any more data from this line so... NEXT!!!
      continue;
   }
   elseif ($in_section && !$in_sub_section)
   {
      // Content without a heading of its own: open an untitled sub section.
      // Clear the title and type so nothing leaks in from an earlier one, and
      // use a counter for the key (a timestamp hash repeats within a second).
      $in_sub_section = true;
      $sub_title = '';
      $section_type = '';
      $clean_sub_title = 'no_title_' . (++$untitled_count);
   }

   if (isset($sections[$clean_title]) && !isset($sections[$clean_title]['sub_section'][$clean_sub_title]))
      $sections[$clean_title]['sub_section'][$clean_sub_title] = array(
         'title' => $sub_title,
         'type' => $section_type,
         'data' => array(),
      );

   // Now for the data.
   $sections[$clean_title]['sub_section'][$clean_sub_title]['data'][] = $line;
}

// Assemble the <body> element: one "page" <sec> per collected section, with
// one nested <sec> per sub section.  Sidebar sections are not rendered here;
// they are queued on $back['sidebar'] instead.
$body = "\n   <body>";

foreach ($sections as $page)
{
   // Sidebar?  They go in the back.
   $is_sidebar = isset($page['type']) && $page['type'] == 'sidebar';
   if ($is_sidebar)
   {
      $back['sidebar'][] = $page;
      continue;
   }

   // First level?  Then it's a page.  An empty title becomes a bare <title/>.
   if (empty($page['title']))
      $page_title = '<title/>';
   else
      $page_title = '<title>' . htmlspecialchars_uni($page['title']) . '</title>';

   $body .= "\n      <sec sec-type=\"page\">\n         " . $page_title;

   // Second level: every sub section of this page, in collection order.
   foreach ($page['sub_section'] as $part)
   {
      // Sub sections without a type are plain content.
      $part_type = empty($part['type']) ? 'content' : $part['type'];

      if (empty($part['title']))
         $part_title = '<title/>';
      else
         $part_title = '<title>' . htmlspecialchars_uni($part['title']) . '</title>';

      $body .= "\n         <sec sec-type=\"" . $part_type . "\">\n            " . $part_title;

      // Now the data.
      $body .= parse_content($part['data']);

      $body .= "\n         </sec>";
   }

   $body .= "\n      </sec>";
}

$body .= "\n   </body>";

// Generate the front section
$front_xml = generate_front($front);
// Tables, references, notes and sidebars
$back_xml = generate_back($back);

$xml = '<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article SYSTEM "/content/journal-publishing-dtd-1.1/journalpublishing.dtd">
<article>' . $front_xml . $body . $back_xml . '
</article>';

// Emails: every address found anywhere in the document is replaced by a mailto ext-link.
$linked_xml = preg_replace('~([0-9A-Za-z=_+\-/][0-9A-Za-z=_\'+\-/\.]*@[\w\-]+(\.[\w\-]+)*(\.[\w]{2,6}))~', '<ext-link ext-link-type="mailto" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="\1" xlink:title="\1"/>', $xml);
// preg_replace() gives null when the regex engine fails; keep the unlinked
// document in that case instead of writing out an empty file.
if ($linked_xml !== null)
   $xml = $linked_xml;

echo $xml;

// The result is saved next to the input, with ".html" swapped for ".xml".
$target = dirname($filename) . '/' . basename($filename, '.html') . '.xml';
$fp = fopen($target, 'w+');
if ($fp === false)
{
   // Without this check fwrite()/fclose() would be handed "false".
   file_put_contents('php://stderr', "\nUnable to open " . $target . " for writing\n");
   exit(1);
}

if (fwrite($fp, $xml))
   echo "\nWROTE AND SAVED FILE";
fclose($fp);

/**
 * Replaces table mentions and figure markers in one piece of content with XML.
 *
 * - "Table N", "Exhibit N" and "Box N" become <xref> elements pointing at "TN".
 * - [[Figure|^|N|^|stem|^|caption]] becomes a <fig> whose graphics are named
 *   after the given stem.
 * - [[InlineFigure|^|N|^|...]] becomes an <inline-graphic>, and
 *   [[Figure|^|N|^|caption]] a <fig>, both named after the global $front
 *   'pub_abbr' and 'article_id'.  These two are only handled when
 *   $front['pub_abbr'] is set.
 *
 * The caption is inserted as it arrives; the callers in this file escape the
 * text before passing it in.
 *
 * @param string $data Content to convert.
 * @return string The content with every recognised marker replaced.
 */
function xml_replacement($data)
{
   global $front;

   $data = preg_replace('~(Table|Exhibit|Box) (\d+)~', '<xref ref-type="table" rid="T\2">\1 \2</xref>', $data);

   // Images live under /images/<first three characters of the id>/<last three>/.
   $image_dir = '/images/' . substr($front['article_id'], 0, 3) . '/' . substr($front['article_id'], -3) . '/';

   // One marker field: any run of characters that does not run into the "|^|"
   // separator or the closing "]]".  Without that limit the two-field pattern
   // also swallows three-field markers and takes "stem|^|caption" as the caption.
   $field = '((?:(?!\|\^\||\]\]).)+)';

   // Figures that carry their own file stem.
   $data = preg_replace(
      '~\[\[Figure\|\^\|(\d+)\|\^\|' . $field . '\|\^\|' . $field . '\]\]~',
      xml_figure_template('$2', '$3', $image_dir),
      $data
   );

   // Images now?  Each kind is replaced on its own, so a line that mixes
   // inline and block figures gets both converted.
   if (isset($front['pub_abbr']))
   {
      $stem = $front['pub_abbr'] . $front['article_id'];

      $data = preg_replace('~\[\[InlineFigure\|\^\|(\d+)\|\^\|(.+?)\]\]~', '
            <inline-graphic xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="' . $image_dir . 'art-' . $stem . '.equation$1.gif">
               <alt-text>Equation</alt-text>
            </inline-graphic>
            ', $data);

      $data = preg_replace(
         '~\[\[Figure\|\^\|(\d+)\|\^\|' . $field . '\]\]~',
         xml_figure_template($stem, '$2', $image_dir),
         $data
      );
   }

   return $data;
}

/**
 * Builds the preg_replace() replacement string for one <fig> element.
 * "$1" in the result refers to the figure number captured by the pattern.
 *
 * @param string $stem Image file stem, either literal text or a backreference.
 * @param string $caption Backreference that holds the caption text.
 * @param string $image_dir Image directory, with a trailing slash.
 * @return string Replacement string for preg_replace().
 */
function xml_figure_template($stem, $caption, $image_dir)
{
   return '
            <fig id="F$1">
               <label>Figure $1.</label>
               <caption>
                  <p>' . $caption . '</p>
               </caption>

               <alt-text>Figure $1</alt-text>
               <graphic xmlns:xlink="http://www.w3.org/1999/xlink" id="art-' . $stem . '.fig$1.gif" xlink:href="' . $image_dir . 'art-' . $stem . '.fig$1.gif" alt-version="yes"/>
               <graphic xmlns:xlink="http://www.w3.org/1999/xlink" id="thumb-' . $stem . '.fig$1.gif" xlink:href="' . $image_dir . 'thumb-' . $stem . '.fig$1.gif" alternate-form-of="art-' . $stem . '.fig$1.gif"/>
            </fig>';
}

/**
 * Builds the <front> element (journal and article metadata).
 *
 * Every free-text value (column, surname, bio, disclosure, pages) is escaped,
 * so a "&" or "<" in the input cannot produce malformed XML.  Publication,
 * article id, volume and issue are written as integers, day and month as two
 * digits.
 *
 * @param array $front Metadata; keys read here: has_images, publication,
 *    article_id, column, surname, bio, disclosure, date (day, month, year),
 *    volume, issue, pages.  Absent keys give empty elements.
 * @return string The XML, preceded by a figurePage processing instruction
 *    when has_images is set and true.
 */
function generate_front($front)
{
   // The instruction is written in pieces so "?" and ">" never sit side by side in the source.
   $figure_page = isset($front['has_images']) && $front['has_images'] == true ? '
   <' . '?figurePage print="true"?' . '>' : '';

   $publication = !empty($front['publication']) ? (int) $front['publication'] : '';
   $article_id = isset($front['article_id']) ? (int) $front['article_id'] : '';
   $volume = !empty($front['volume']) ? (int) $front['volume'] : '';
   $issue = !empty($front['issue']) ? (int) $front['issue'] : '';

   // The column title gets a line of its own, and only when a column is given.
   $column = isset($front['column']) ? '
            <series-title series-type="column">' . htmlspecialchars_uni($front['column']) . '</series-title>' : '';

   $surname = isset($front['surname']) ? htmlspecialchars_uni($front['surname']) : '';
   $bio = isset($front['bio']) ? htmlspecialchars_uni($front['bio']) : '';
   $disclosure = isset($front['disclosure']) ? htmlspecialchars_uni($front['disclosure']) : '';
   $pages = isset($front['pages']) ? htmlspecialchars_uni($front['pages']) : '';

   $day = isset($front['date']['day']) ? sprintf("%02d", (int) $front['date']['day']) : '';
   $month = isset($front['date']['month']) ? sprintf("%02d", (int) $front['date']['month']) : '';
   $year = isset($front['date']['year']) ? (int) $front['date']['year'] : '';

   $front_xml = $figure_page . '
   <front>
      <journal-meta>
         <journal-id journal-id-type="publication">' . $publication . '</journal-id>
         <issn/>
         <publisher>
            <publisher-name/>
         </publisher>
      </journal-meta>

      <article-meta>
         <article-id>' . $article_id . '</article-id>

         <article-categories>
            <subj-group>
               <subject>journalArticle</subject>
            </subj-group>
            <series-title></series-title>' . $column . '
         </article-categories>

         <title-group>
            <article-title></article-title>
            <subtitle></subtitle>
         </title-group>

         <contrib-group>
            <contrib author-id="0" contrib-type="author">
               <name>
                  <surname>' . $surname . '</surname>
               </name>
               <bio>
                  <p>' . $bio . '</p>
               </bio>
               <role>Additional Author</role>
               <author-comment>
                  <p>' . $disclosure . '</p>
               </author-comment>
            </contrib>
         </contrib-group>

         <pub-date>
            <day>' . $day . '</day>
            <month>' . $month . '</month>
            <year>' . $year . '</year>
         </pub-date>

         <volume>' . $volume . '</volume>
         <issue>' . $issue . '</issue>
         <fpage>' . $pages . '</fpage>

         <copyright-year/>

         <abstract>
            <title></title>
            <p></p>
         </abstract>
      </article-meta>
   </front>';

   return $front_xml;
}

/**
 * Renders the collected references as a <ref-list> element.
 *
 * The list type follows the global $reference_ordered flag: "order" when it
 * is truthy, "bullet" otherwise.  Entries are inserted as given.
 *
 * @param array $references Reference texts, one per list item.
 * @return string The XML, or an empty string when there are no references.
 */
function generate_references($references)
{
   global $reference_ordered;

   // Nothing collected, nothing to show.
   if (empty($references))
      return '';

   if ($reference_ordered)
      $list_type = 'order';
   else
      $list_type = 'bullet';

   // Collect the pieces first and glue them together at the end.
   $pieces = array(
      "\n      <ref-list>",
      "\n         <title>References</title>",
      "\n         <list list-type=\"" . $list_type . "\">",
   );

   foreach ($references as $entry)
      $pieces[] = "\n            <list-item><p>" . $entry . "</p></list-item>";

   $pieces[] = "\n         </list>";
   $pieces[] = "\n      </ref-list>";

   return implode('', $pieces);
}

/**
 * Builds the <back> element: table placeholders, references,
 * acknowledgements, footnotes and sidebars.
 *
 * Table placeholders come from the "Table N" / "Exhibit N" / "Box N" mentions
 * found in the global $body.  Each distinct number gets one <table-wrap> whose
 * id is "T" plus that number and whose label uses the word of its first
 * mention.  Image paths are derived from the global $front 'article_id' and
 * 'pub_abbr'.
 *
 * @param array $back Keys read here: references, ack, financial, conflict,
 *    abbr, reprint, sidebar (a list of sections, each with 'title' and
 *    'sub_section').
 * @return string The XML for the <back> element.
 */
function generate_back($back)
{
   global $front, $body;

   $xml = '
   <back>';
   $notes = '';

   // Tables?  Remember each number once, with the word it was first mentioned by.
   $table_labels = array();
   if (preg_match_all('~(Table|Exhibit|Box) (\d+)~', (string) $body, $mentions, PREG_SET_ORDER))
   {
      foreach ($mentions as $mention)
         if (!isset($table_labels[$mention[2]]))
            $table_labels[$mention[2]] = $mention[1];

      ksort($table_labels, SORT_NUMERIC);
   }

   // Tables.
   if (!empty($table_labels))
   {
      $image_dir = '/images/' . substr($front['article_id'], 0, 3) . '/' . substr($front['article_id'], -3) . '/';
      $image_stem = 'art-' . (isset($front['pub_abbr']) ? $front['pub_abbr'] : '') . $front['article_id'];

      $xml .= '
      <sec sec-type="table">
         <title/>
         <table-wrap-group>';
      foreach ($table_labels as $number => $kind)
         $xml .= '
            <table-wrap id="T' . $number . '">
               <label>' . $kind . ' ' . $number . '.</label>
               <caption>
                  <p></p>
               </caption>
               <graphic xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="' . $image_dir . $image_stem . '.tab' . $number . '.gif" alt-version="no"/>
            </table-wrap>';
      $xml .= '
         </table-wrap-group>
      </sec>';
   }

   // References
   if (!empty($back['references']))
      $xml .= generate_references($back['references']);

   // Ack
   if (!empty($back['ack']))
      $xml .= '
      <ack>
         <title>Acknowledgements</title>
         <p>' . htmlspecialchars($back['ack']) . '</p>
      </ack>';

   // Footnotes: key in $back => fn-type written to the XML.
   $note_types = array(
      'financial' => 'supported-by',
      'conflict' => 'conflict',
      'abbr' => 'abbr',
      'reprint' => 'present-address',
   );
   foreach ($note_types as $key => $fn_type)
      if (!empty($back[$key]))
         $notes .= '
         <fn fn-type="' . $fn_type . '">
            <p>' . htmlspecialchars($back[$key]) . '</p>
         </fn>';

   if (!empty($notes))
      $xml .= '
      <fn-group>' . $notes . '
      </fn-group>';

   // Sidebars: one <app> per sub section of every sidebar section.
   if (!empty($back['sidebar']))
   {
      $xml .= '
      <app-group>';

      foreach ($back['sidebar'] as $sidebar)
         foreach ($sidebar['sub_section'] as $sub_section2)
            $xml .= '
         <app>
            <title/>
            <sec sec-type="sidebar">
               <title>Sidebar: ' . htmlspecialchars_uni($sidebar['title']) . '</title>
               ' . parse_content($sub_section2['data']) . '
            </sec>
         </app>';

      $xml .= '
      </app-group>';
   }
   $xml .= '
   </back>';

   return $xml;
}

/**
 * Converts the collected content lines of one sub section into XML.
 *
 * Recognised lines: list openers (<ul>, <ol>, <ol type="a">, <ol type="A">),
 * list closers, <li> items, a <p> or </p> standing alone on its line, and a
 * complete <p>...</p> on one line.  Any other line is escaped and written out
 * as text.  All text goes through htmlspecialchars_uni() and then
 * xml_replacement().
 *
 * @param array $data_array Content lines, already trimmed by the caller.
 * @return string The XML fragment; every piece starts on a new line.
 */
function parse_content($data_array)
{
   $ret = '';
   // Are we between a lone <p> line and its </p>?  Set up front so the check
   // below never reads an undefined variable.
   $open_p = false;

   // Opening tag => list-type written to the XML.
   $list_types = array(
      '<ul>' => 'bullet',
      '<ol>' => 'order',
      '<ol type="a">' => 'alpha-lower',
      '<ol type="A">' => 'alpha-upper',
   );

   foreach ($data_array as $data)
   {
      // Bullet, ordered or lettered list?
      if (isset($list_types[$data]))
         $ret .= '
            <list list-type="' . $list_types[$data] . '">';
      // List items?
      elseif (substr($data, 0, 4) == '<li>')
      {
         // Strip the closing tag only when it is present, so an item that is
         // closed on a later line keeps its last five characters.
         $item = (string) substr($data, 4);
         if (substr($item, -5) == '</li>')
            $item = (string) substr($item, 0, -5);

         $ret .= '
               <list-item><p>' . xml_replacement(htmlspecialchars_uni($item)) . '</p></list-item>';
      }
      // Closing them?
      elseif ($data == '</ul>' || $data == '</ol>')
         $ret .= '
            </list>';
      elseif ($data == '<p>')
      {
         $ret .= '
            <p>';
         $open_p = true;
      }
      elseif ($data == '</p>')
      {
         $ret .= '
            </p>';
         $open_p = false;
      }
      // Text inside a paragraph that was opened on a line of its own.
      elseif ($open_p)
         $ret .= '
               ' . xml_replacement(htmlspecialchars_uni($data));
      // A whole paragraph on one line.
      elseif (preg_match('~^<p>(.+?)</p>$~', $data, $matches))
         $ret .= '
            <p>' . xml_replacement(htmlspecialchars_uni($matches[1])) . '</p>';
      else
         $ret .= '
            ' . xml_replacement(htmlspecialchars_uni($data));
   }

   return $ret;
}

/**
 * Escapes a string for use in the generated XML.
 *
 * NOTE(review): this function used to run str_replace() with four empty
 * search strings before escaping.  An empty search string never matches, so
 * that call did nothing; it was presumably meant to swap special characters
 * (such as typographic quotes) that were lost from this copy of the file --
 * TODO restore from the original if it can be found.  A second, unreachable
 * return statement (ENT_NOQUOTES) has been dropped as well.
 *
 * @param string $string Raw text.
 * @return string The text with &, <, > and quotes converted to entities,
 *    using the default flags of htmlspecialchars().
 */
function htmlspecialchars_uni($string)
{
   return htmlspecialchars($string);
}

?>