<?php
/*
author: Timotheus Pokorra (timotheus@pokorra.de)
file: parsebook.php
autumn 2002
feel free to use and modify
no guarantee
*/
?>

<?php
/* written by Timotheus Pokorra, 31.08.2002 */

    /*
     * Copies a single attribute into a collection of records.
     *
     * The record is selected by the 'id' entry of $attributes; the value of
     * $attributes[$field] is stored under the same field name in that record.
     *
     * $destination  array of records (id => array of fields), changed in place
     * $attributes   attribute array of the current element
     * $field        name of the attribute to copy
     */
    function copyValue(&$destination, &$attributes, $field)
    {
        $recordId = $attributes['id'];
        $copied = $attributes[$field];
        $destination[$recordId][$field] = $copied;
    }
    /*
     * Formats a list of per-level chapter counters as a dotted number,
     * e.g. array(1, 2, 0) becomes "1.2".
     *
     * Levels are read from index 0 upwards; the first counter that equals 0
     * ends the number, so deeper levels after it are ignored.
     *
     * $chapterNum  array of counters indexed 0..n-1
     * returns      the dotted chapter number, or "" if the first counter is 0
     */
    function printChapterNum(&$chapterNum)
    {
        $label = "";
        $levels = count($chapterNum);
        for ($depth = 0; $depth < $levels; $depth++)
        {
            $number = $chapterNum[$depth];
            if ($number == 0)
                break;
            if ($label != "")
                $label .= ".";
            $label .= $number;
        }
        return $label;
    }

    /*
     * Splits $text at every stand-alone, case-insensitive occurrence of
     * $keyword.
     *
     * An occurrence counts as stand-alone when the character before it (if
     * any) is not alphanumeric and the character after it (if any) is either
     * not alphanumeric or the letter 's' (simple plural). Alphanumeric is
     * decided by is_alnum(), which is defined outside this block.
     *
     * $keyword  the word to look for
     * $text     the text to split
     * returns   a list of items: array('type' => 'text', 'value' => ...) for
     *           plain text and array('type' => 'keyword', 'value' => matched
     *           text, 'id' => $keyword) for each hit. The list always ends
     *           with a 'text' item holding the remaining text, so a result
     *           with exactly one item means "no match".
     */
    function replaceKeyword($keyword, $text)
    {
        $result = array();
        $keywordLength = strlen($keyword);

        // An empty keyword can never be a meaningful hit; searching for it
        // would match at every position and never terminate.
        if ($keywordLength > 0)
        {
            $needle = strtoupper($keyword);
            $haystack = strtoupper($text);
            $posstart = 0;
            while (true)
            {
                $poshttp = strpos($haystack, $needle, $posstart);
                if ($poshttp === false)
                    break;

                $posafter = $poshttp + $keywordLength;
                $before_ok = ($poshttp == 0) || !is_alnum($text[$poshttp-1]);
                $after_ok = $posafter >= strlen($text)
                    || !is_alnum($text[$posafter])
                    || ($text[$posafter] == 's');

                if (!$before_ok || !$after_ok)
                {
                    // Occurrence is part of a longer word: continue searching
                    // behind it. The step must be at least 1, otherwise a
                    // one-character keyword is found at the same position
                    // again and the loop never ends.
                    $posstart = $poshttp + max(1, $keywordLength - 1);
                    continue;
                }

                $item = array();
                $item['type'] = 'text';
                $item['value'] = substr($text, 0, $poshttp);
                $result[] = $item;

                $item['type'] = 'keyword';
                $item['value'] = substr($text, $poshttp, $keywordLength);
                $item['id'] = $keyword;
                $result[] = $item;

                // go on with the text behind the hit
                $text = substr($text, $posafter);
                $haystack = strtoupper($text);
                $posstart = 0;
            }
        }

        $item = array();
        $item['type'] = 'text';
        $item['value'] = $text;
        $result[] = $item;
        return $result;
    }
    /*
     * Splits $text at every URL that starts with "http://" (matched
     * case-insensitively).
     *
     * A URL ends at the next space or '<', whichever comes first, or at the
     * end of the text. Trailing characters for which is_alnum() (defined
     * outside this block) returns false are cut off the URL and stay in the
     * following text.
     *
     * $text    the text to split
     * returns  a list of items: array('type' => 'text', 'value' => ...) for
     *          plain text and array('type' => 'http', 'value' => URL,
     *          'id' => null) for each URL. The list always ends with a 'text'
     *          item holding the remaining text, so a result with exactly one
     *          item means "no URL found".
     */
    function replaceHTTP2($text)
    {
        $result = array();
        while (true)
        {
            $poshttp = strpos(strtoupper($text), 'HTTP://');
            if ($poshttp === false)
                break;

            // The URL ends at the nearest delimiter. A '<' must also win when
            // there is no space at all behind the URL.
            $posafter = strpos($text, ' ', $poshttp);
            $posafter2 = strpos($text, "<", $poshttp);
            if ($posafter2 !== false
                && ($posafter === false || $posafter2 < $posafter))
                $posafter = $posafter2;
            if ($posafter === false)
                $posafter = strlen($text);

            // Drop trailing punctuation; the URL starts with a letter, so
            // this loop stops at the latest on the leading 'h'.
            $http = substr($text, $poshttp, $posafter-$poshttp);
            while (!is_alnum($http[strlen($http)-1]))
            {
                $posafter--;
                $http = substr($text, $poshttp, $posafter-$poshttp);
            }

            $item = array();
            $item['type'] = 'text';
            $item['value'] = substr($text, 0, $poshttp);
            $result[] = $item;

            $item['type'] = 'http';
            $item['value'] = $http;
            // The original assigned an undefined variable here, which yields
            // null; the key is kept with that value so the item shape seen by
            // callers does not change.
            $item['id'] = null;
            $result[] = $item;

            $text = substr($text, $posafter);
        }

        $item = array();
        $item['type'] = 'text';
        $item['value'] = $text;
        $result[] = $item;
        return $result;
    }

    /*
     * Replaces URLs and known keywords inside the text items of a paragraph.
     *
     * Every item of type 'text' is split by replaceHTTP2() (skipped when
     * $printword is true) and otherwise by replaceKeyword() for each key of
     * $book['keywords'], in the order of that array. When a split produces
     * more than one piece, the item is turned into a 'struct' item holding
     * the pieces and is then processed again, so the remaining text pieces
     * are searched recursively. Items of type 'struct' are processed
     * recursively; all other item types are left untouched.
     *
     * $book       the book array; only $book['keywords'] is read here
     * $paragraph  list of items (0-based), changed in place; any value that
     *             is not an array is returned unchanged
     * $printword  when true, URLs are not split off
     * returns     the processed paragraph
     */
    function buildIndexParagraph(&$book, &$paragraph, $printword)
    {
        // Nothing to index in an empty or plain-string paragraph.
        if (!is_array($paragraph))
            return $paragraph;

        for ($counter = 0; $counter < count($paragraph); $counter++)
        {
            $item = $paragraph[$counter];
            if ($item['type'] == 'struct')
            {
                $paragraph[$counter]['value'] =
                    buildIndexParagraph($book, $item['value'], $printword);
            }
            else if ($item['type'] == 'text')
            {
                // $pieces stays null when URL splitting is switched off, so
                // count() is never applied to a plain string.
                $pieces = null;
                if (!$printword)
                    $pieces = replaceHTTP2($item['value']);

                if (is_array($pieces) && count($pieces) > 1)
                {
                    $paragraph[$counter]['type'] = 'struct';
                    $paragraph[$counter]['value'] = $pieces;
                    // visit the same index again, now as a 'struct'
                    $counter--;
                }
                else
                {
                    foreach ($book['keywords'] as $keyword => $count)
                    {
                        $pieces = replaceKeyword($keyword, $item['value']);
                        if (count($pieces) > 1)
                        {
                            $paragraph[$counter]['type'] = 'struct';
                            $paragraph[$counter]['value'] = $pieces;
                            // visit the same index again, now as a 'struct';
                            // the other keywords are applied to the pieces
                            // during that visit
                            $counter--;
                            break;
                        }
                    }
                }
            }
        }
        return $paragraph;
    }

    /*
     * Runs the index replacement over one chapter.
     *
     * The chapter title is processed with buildIndexParagraph(). Then every
     * entry of $chapter['struct'] is processed according to its type:
     * 'chapter' entries recursively with buildIndexChapter(), 'paragraph'
     * entries with buildIndexParagraph(). Other entry types are left alone.
     *
     * $book       the book array, handed on to the paragraph indexer
     * $chapter    the chapter array, changed in place
     * $printword  handed on unchanged
     * returns     the processed chapter
     */
    function buildIndexChapter (&$book, &$chapter, $printword)
    {
        $chapter['title'] = buildIndexParagraph($book, $chapter['title'], $printword);

        if ($chapter['struct'] != 0)
        {
            foreach ($chapter['struct'] as $position => $entry)
            {
                if ($entry['type'] == 'chapter')
                {
                    $indexed = buildIndexChapter($book, $entry['value'], $printword);
                    $chapter['struct'][$position]['value'] = $indexed;
                }
                if ($entry['type'] == 'paragraph')
                {
                    $indexed = buildIndexParagraph($book, $entry['value'], $printword);
                    $chapter['struct'][$position]['value'] = $indexed;
                }
            }
        }

        return $chapter;
    }

    /*
     * Runs buildIndexChapter() over every entry of $book['chapters'] and
     * stores the result back under the same key.
     *
     * $book       the book array, changed in place
     * $printword  handed on unchanged to buildIndexChapter()
     */
    function buildIndex(&$book, $printword)
    {
        foreach ($book['chapters'] as $position => $original)
        {
            $indexed = buildIndexChapter($book, $original, $printword);
            $book['chapters'][$position] = $indexed;
        }
    }
    /*
     * Appends one 'text' item to $result that wraps the content of $value in
     * markup for $htmltag, and advances $item.
     *
     * The markup is written with the entities '&llt;' and '&ggt;' instead of
     * angle brackets (presumably resolved by a later output step -- confirm).
     * Depending on the helpers completeTag(), beginTag() and endTag(), which
     * are defined outside this block:
     *   complete tag: opening tag with attributes, content, closing tag; the
     *                 content is passed through replaceHTTP() unless the tag
     *                 name is 'a'
     *   begin tag:    opening tag (without attributes) followed by content
     *   end tag:      content followed by the closing tag
     *   otherwise:    the content only
     *
     * $value    one element of the parsed XML structure
     * $result   item list of the current paragraph, changed in place
     * $item     index of the next free slot in $result, incremented by one
     * $htmltag  name of the HTML tag to emit
     */
    function readHTMLItem(&$value, &$result, &$item, $htmltag)
    {
        $content = $value['value'];
        $attributeList = $value['attributes'];
        $closing = '&llt;/'.$htmltag.'&ggt;';

        $result[$item]['type'] = 'text';
        if (completeTag($value))
        {
            if ($value['tag'] != 'a')
                $content = replaceHTTP($content, 1);

            $markup = '&llt;'.$htmltag;
            if ($attributeList != 0)
            {
                foreach ($attributeList as $name => $setting)
                    $markup .= ' '.$name.' = '.$setting;
            }
            $result[$item]['value'] = $markup.'&ggt;'.$content.$closing;
        }
        else if (beginTag($value))
        {
            $result[$item]['value'] = '&llt;'.$htmltag.'&ggt;'.$content;
        }
        else if (endTag($value))
        {
            $result[$item]['value'] = $content.$closing;
        }
        else
        {
            $result[$item]['value'] = $content;
        }
        $item++;
    }

    /*
     * Converts the elements of one PARAGRAPH into a flat list of items.
     *
     * Reading starts at $values[$counter] and stops at the closing PARAGRAPH
     * tag (as reported by endTag()) or at the end of $values. On return
     * $counter points at the element that ended the paragraph. The helpers
     * getAttributes(), readHtml() and endTag() are defined outside this
     * block.
     *
     * Side effects besides the returned list:
     *   FIGURE        with a 'src' attribute registers the figure in $figures
     *                 and assigns it the number "<figureNum[0]>.<figureNum[1]>"
     *   SOURCE        with a 'title' attribute registers the source in $sources
     *   KEYWORD,
     *   ABBREVIATION  register their id in $keywords
     *
     * $values     element list produced by xml_parse_into_struct()
     * $counter    current position in $values, advanced while reading
     * $figureNum  array(chapter number, figure counter inside the chapter)
     * $figures    figure registry (id => record), changed in place
     * $sources    source registry (id => record), changed in place
     * $keywords   keyword registry (id => record), changed in place
     * returns     list of items, each an array with at least a 'type' entry
     */
    function parseParagraph(&$values, &$counter,
                 &$figureNum, &$figures, &$sources, &$keywords)
    {
        // attributes of a SOURCE element that are copied into $sources
        $sourceFields = array('short', 'author', 'title', 'isbn', 'publisher',
            'publishedin', 'quotedin', 'quotedpage', 'datepublished',
            'internet', 'local', 'datefound');
        // tags whose content is collected with readHtml() => item type
        $htmlItemTypes = array('HTML' => 'text', 'PRE' => 'pre', 'B' => 'bold');
        // list tags => HTML tag written by readHTMLItem()
        $listTags = array('ITEMIZEDLIST' => 'ul', 'LISTITEM' => 'li',
            'OL' => 'ol', 'UL' => 'ul', 'LI' => 'li');

        $result = array();
        $item = 0;
        for (; $counter < count($values); $counter++)
        {
            $value = $values[$counter];
            $attributes = getAttributes($value);
            $tag = $value['tag'];

            if ($tag == 'FIGURE')
            {
                if ($attributes['src'] != "")
                {
                    copyValue($figures, $attributes, 'src');
                    copyValue($figures, $attributes, 'title');
                    $figureNum[1]++;
                    $figures[$attributes['id']]['num'] =
                          $figureNum[0].".".$figureNum[1];
                }
                if ($attributes['link'] != -1)
                    $result[$item]['type'] = 'figurelink';
                else
                    $result[$item]['type'] = 'figure';
                $result[$item]['id'] = $attributes['id'];
                $item++;
            }
            else if ($tag == 'SOURCE')
            {
                if ($attributes['title'] != "")
                {
                    foreach ($sourceFields as $field)
                        copyValue($sources, $attributes, $field);
                }
                if ($attributes['show'] != -1)
                {
                    $result[$item]['type'] = 'source';
                    $result[$item]['id'] = $attributes['id'];
                    $result[$item]['page'] = $attributes['page'];
                    $item++;
                }
            }
            else if ($tag == 'KEYWORD')
            {
                // without an explicit id the element text is the id
                if ($attributes['id'] == "")
                    $attributes['id'] = $value['value'];
                $keywords[$attributes['id']] = array();
                if ($attributes['show'] != -1)
                {
                    $result[$item]['type'] = 'keyword';
                    $result[$item]['id'] = $attributes['id'];
                    $result[$item]['value'] = $value['value'];
                    $result[$item]['show'] = $attributes['show'];
                    $result[$item]['plural'] = $attributes['plural'];
                    $item++;
                }
            }
            else if ($tag == 'ABBREVIATION')
            {
                // without an explicit id the element text is the id
                if ($attributes['id'] == "")
                    $attributes['id'] = $value['value'];
                $keywords[$attributes['id']] = array();
                $keywords[$attributes['id']]['long'] = $attributes['long'];
                if ($attributes['show'] != -1)
                {
                    $result[$item]['type'] = 'keyword';
                    $result[$item]['id'] = $attributes['id'];
                    $result[$item]['value'] = $value['value'];
                    if ($attributes['long'])
                    {
                        // with a long form the id replaces the element text
                        $result[$item]['long'] = $attributes['long'];
                        $result[$item]['value'] = $attributes['id'];
                        $result[$item]['show'] = $attributes['show'];
                    }
                    $item++;
                }
            }
            else if ($tag == 'COMMAND')
            {
                $result[$item]['type'] = 'command';
                $result[$item]['readfile'] = $attributes['readfile'];
                $item++;
            }
            else if (isset($htmlItemTypes[$tag]))
            {
                // readHtml() receives $counter by name and may advance it
                $s = readHtml($values, $counter, $tag);
                $result[$item]['type'] = $htmlItemTypes[$tag];
                $result[$item]['value'] = $s;
                $item++;
            }
            else if ($tag == 'BR')
            {
                $result[$item]['type'] = 'br';
                $item++;
            }
            else if ($tag == 'PARAGRAPH')
            {
                $s = $value['value'];
                if ($s != "")
                {
                    $result[$item]['type'] = 'text';
                    $result[$item]['value'] = $s;
                    $item++;
                }
                // closing tag of the paragraph: done
                if (endTag($value))
                    return $result;
            }
            else if (isset($listTags[$tag]))
            {
                readHTMLItem($value, $result, $item, $listTags[$tag]);
            }
            else
            {
                // unknown tag: keep its text, drop the markup
                $s = $value['value'];
                if ($s != "")
                {
                    $result[$item]['type'] = 'text';
                    $result[$item]['value'] = $s;
                    $item++;
                }
            }
        }
        return $result;
    }
    /*
     * Parses one CHAPTER element, including nested chapters, into an array.
     *
     * Reading starts at the opening CHAPTER element $values[$counter]. On
     * return $counter points at the element that ended the chapter. The
     * helpers getAttributes(), readHtml(), beginTag() and endTag() are
     * defined outside this block.
     *
     * $values        element list produced by xml_parse_into_struct()
     * $counter       current position in $values, advanced while reading
     * $depthChapter  nesting depth of this chapter (0 for top level)
     * $chapterNum    per-depth chapter counters, updated in place; the
     *                counter of this depth is incremented unless the
     *                chapter has chapternum="-1"
     * $figureNum     figure numbering state, handed on to parseParagraph()
     * $book          the book array; its 'figures', 'sources' and 'keywords'
     *                registries are filled by parseParagraph()
     * returns        array with 'title', 'chapterNum' (only for numbered
     *                chapters), 'chapterDepth' and 'struct' (list of
     *                'paragraph' and 'chapter' entries)
     */
    function parseChapter(&$values, &$counter, $depthChapter, &$chapterNum,
                              &$figureNum, &$book)
    {
        $value = $values[$counter];
        $attributes = getAttributes($value);
        $numbered = ($attributes["chapternum"] != -1);
        if ($numbered)
            $chapterNum[$depthChapter]++;
        $depthChapter++;
        // sub-chapters of this chapter start counting from scratch
        $chapterNum[$depthChapter] = 0;

        $result = array();
        $result['title'] = array();
        $item = array();
        $item['value'] = $attributes['title'];
        $item['type'] = 'text';
        $result['title'][] = $item;
        if ($numbered)
            $result['chapterNum'] = printChapterNum($chapterNum);
        $result['chapterDepth'] = $depthChapter;
        $result['struct'] = array();
        $item = 0;

        if (endTag($value))
            return $result;
        $counter++;
        for (; $counter < count($values); $counter++)
        {
            $value = $values[$counter];
            $attributes = getAttributes($value);
            if ($value['tag'] == 'TITLE')
                $result['title'] = readHtml($values, $counter, 'TITLE');
            if ($value['tag'] == 'PARAGRAPH')
            {
                $result['struct'][$item]['type'] = 'paragraph';
                $result['struct'][$item]['value'] =
                    parseParagraph($values, $counter, $figureNum,
                        $book['figures'], $book['sources'], $book['keywords']);
                $item++;
            }
            if ($value['tag'] == 'CHAPTER')
            {
                if (endTag($value))
                {
                    return $result;
                }
                else if (beginTag($value))
                {
                    $result['struct'][$item]['type'] = 'chapter';
                    $result['struct'][$item]['value'] =
                        parseChapter($values, $counter, $depthChapter, $chapterNum,
                                 $figureNum, $book);
                    $item++;
                }
            }
        }
        // Input ended before the closing CHAPTER tag: hand back what was
        // read so far instead of returning nothing.
        return $result;
    }

	/*
	 * Comparison callback that orders strings by length, longest first.
	 * Strings of equal length compare as equal.
	 *
	 * returns  -1 if $a is longer than $b, 1 if it is shorter, 0 otherwise
	 */
	function cmpKeyword($a, $b)
	{
		$lengthA = strlen($a);
		$lengthB = strlen($b);
		if ($lengthA > $lengthB)
			return -1;
		if ($lengthA < $lengthB)
			return 1;
		return 0;
	}
	/*
	 * Comparison callback for ascending order, based on PHP's loose
	 * comparison operators == and <.
	 *
	 * returns  0 if $a == $b, -1 if $a < $b, 1 otherwise
	 */
	function cmpShort($a, $b)
	{
		return ($a == $b) ? 0 : (($a < $b) ? -1 : 1);
	}

    /*
     * Reads an XML book file and returns it as a nested array.
     *
     * The file is parsed with xml_parse_into_struct(). Every CHAPTER element
     * is converted by parseChapter(). When the closing BOOK tag is reached,
     * the sources are sorted by key (cmpShort), the keywords by key length,
     * longest first (cmpKeyword), and buildIndex() is run over the chapters.
     * The helpers getAttributes(), readHtml() and endTag() are defined
     * outside this block.
     *
     * $filename   path of the XML file
     * $printword  when false, everything between an opening and a closing
     *             HIDE element is skipped; the flag is also handed on to
     *             buildIndex()
     * returns     array with the entries 'sources', 'figures', 'keywords',
     *             'chapters' and, if the book has one, 'title'
     */
    function parseBook($filename, $printword)
    {
        $counter = 0;
        $depthChapter = 0;
        $chapterNum = array();
        $chapterNum[0] = 0;
        $figureNum = array();
        $figureNum[0] = 0;
        $book = array();
        $book['sources'] = array();
        $book['figures'] = array();
        $book['keywords'] = array();
        $book['chapters'] = array();
        $item = 0;

        // An unreadable file is treated like an empty one.
        $lines = file($filename);
        $data = ($lines === false) ? "" : implode("", $lines);
        $parser = xml_parser_create();
        xml_parse_into_struct($parser, $data, $values);
        xml_parser_free($parser);

        // nothing could be parsed: return the empty book
        if (empty($values))
            return $book;
        $total = count($values);

        // the title may be given as an attribute of the root element
        $value = $values[$counter];
        $attributes = getAttributes($value);
        if ($attributes['title'] != '')
            $book['title'] = $attributes['title'];

        for (; $counter < $total; $counter++)
        {
            $value = $values[$counter];
            $attributes = getAttributes($value);
            if ($value['tag'] == 'TITLE')
                $book['title'] = readHtml($values, $counter, 'TITLE');

            if ($value['tag'] == 'HIDE' && !$printword)
            {
                // skip ahead to the closing HIDE tag
                while (!(endTag($value) && $value['tag'] == 'HIDE'))
                {
                    $counter++;
                    // closing tag missing: everything up to the end of the
                    // input is hidden, so there is nothing left to read
                    if ($counter >= $total)
                        return $book;
                    $value = $values[$counter];
                }
            }

            if ($value['tag'] == 'CHAPTER')
            {
                // figures are numbered per top-level chapter
                $figureNum[0]++;
                $figureNum[1] = 0;
                $book['chapters'][$item] =
                    parseChapter($values, $counter, $depthChapter, $chapterNum,
                             $figureNum, $book);
                $item++;
            }
            if ($value['tag'] == 'BOOK' && endTag($value))
            {
                // callbacks are passed by name as strings
                uksort($book['sources'], 'cmpShort');
                uksort($book['keywords'], 'cmpKeyword');
                buildIndex($book, $printword);
                return $book;
            }
        }
        return $book;
    }
?>