My Wiki Parser Class for PHP

I was looking for a script that could parse the wiki syntax used in Google Code (see http://code.google.com/p/support/wiki/WikiSyntax). I could only find MediaWiki parsers, which weren’t what I wanted. The closest I could find was a PHP class which parsed ‘wiki’ syntax but had quite a few bugs in it (http://www.phpclasses.org/package/3111-PHP-Retrieve-and-process-articles-from-Wikipedia.html). This was my starting point.

I used the WikiSyntax page given above as a reference for the features I wanted. I didn’t implement all of them, because I didn’t need all of them, but you can edit the code to suit your needs if you want to.

<?php
/**
 * Line-oriented parser that turns a subset of the Google Code wiki syntax
 * into HTML.
 *
 * Usage: $result = (new WikiParser())->parse($text);
 * $result['contents'] is the generated HTML and $result['headers'] is a list
 * of array('name' => ..., 'slug' => ..., 'level' => ...) entries, one per
 * section heading, in document order.
 *
 * Supported constructs (see parse_line() and parse_char()):
 *   = Heading =            section headings (level = number of leading '=')
 *   <blank line>           paragraph break
 *   <space>* item / # item bulleted / numbered lists (indent = nesting level)
 *   ----                   horizontal rule
 *   || a || b ||           table row
 *   *bold* _italic_ `code` inline markup
 *   [target text]          links (images when the target ends in jpg/gif/bmp/png)
 *   {{{ ... }}}            code blocks, rendered through highlight_string()
 *
 * The input is not HTML-escaped: markup in the wiki text passes through
 * to the output unchanged, so only feed it trusted text.
 */
class WikiParser {
    /** Prefix prepended to link targets that do not contain '://'. */
    public $reference_wiki = '/help/docs/';
    /** Not read anywhere in this class; kept for backward compatibility. */
    public $image_uri = '';
    /** Not read anywhere in this class; kept for backward compatibility. */
    public $ignore_images = true;
    /** Set by a line handler to stop trying further line patterns. */
    public $stop = false;
    /** Set by a line handler to stop pattern matching AND skip inline parsing. */
    public $stop_all = false;
    /** Target of a leading "#REDIRECT [[...]]" line, or false. */
    public $redirect = false;
    /** Stack used by handle_save_nowiki()/handle_restore_nowiki(). */
    public $nowikis = array();
    /** Reset by parse(); not otherwise used in this class. */
    public $linknumber = 0;
    /** Updated by parse_line(); not otherwise read in this class. */
    public $suppress_linebreaks = false;
    /** Title passed to parse(). */
    public $page_title = '';

    private $preformat=false;
    private $deflist=false;
    private $list_level=0;
    private $list_level_types=array();
    private $table=false;
    private $tr=false;
    private $headers=array();
    private $line='';
    private $processing='';
    private $lproc=array();
    private $codeblocks=array();
    private $link=0;
    private $inp=false;
    private $was_li=false;
    private $codeblocks_ptr=-1;

    /**
     * Initialise the configurable defaults.
     *
     * This used to be a PHP 4 style constructor (a method named after the
     * class), which PHP 8 no longer treats as a constructor.
     */
    public function __construct() {
        $this->reference_wiki = '/help/docs/';
        $this->image_uri = '';
        $this->ignore_images = true;
    }

    /**
     * Line handler for "= Heading =": records the heading and emits an <hN>
     * element carrying a named anchor.
     *
     * @param array $matches [1] = opening '=' run, [2] = heading text
     * @return string HTML for the heading (closing an open paragraph first)
     */
    public function handle_sections($matches) {
        $level = strlen($matches[1]);
        $content = $matches[2];
        $slug = preg_replace('/\W/','_',strtolower(trim($content)));
        $this->stop = true;
        $this->stop_all = true;
        $this->headers[] = array('name'=>$content,'slug'=>$slug,'level'=>$level);
        $var = '';
        if ($this->inp) {
            $var = '</p>';
            $this->inp = false;
        }
        return "$var\n<h{$level}><a name=\"$slug\"></a>{$content}</h{$level}>\n\n";
    }

    /**
     * Line handler for blank lines: closes an open table, otherwise starts a
     * new paragraph (closing the previous one if there is one).
     *
     * @param array $matches unused
     * @return string HTML fragment
     */
    public function handle_newline($matches) {
        if ($this->table) {
            return $this->table(true);
        }
        $this->stop = true;
        $var = $this->inp ? '</p>' : '';
        // Remember that a paragraph is open so that the next blank line or
        // heading can close it; previously this flag was never set.
        $this->inp = true;
        return $var.'<p>';
    }

    /**
     * Line handler for list items; also used to close all open lists.
     *
     * The nesting level is the length of the leading whitespace; the list type
     * is taken from the last marker character ('*' => ul, '#' => ol).
     *
     * @param array|false $matches [1] = indent, [2] = markers, [3] = item text
     * @param bool $close when true, only emit the closing tags for every open list
     * @return string HTML fragment
     */
    public function handle_list($matches,$close=false) {
        $listtypes = array(
            '*'=>'ul',
            '#'=>'ol',
        );
        $output = "";
        $newlevel = ($close) ? 0 : strlen($matches[1]);
        // Open or close one list per step until the nesting depth matches.
        while ($this->list_level != $newlevel) {
            if ($this->list_level > $newlevel) {
                $listtype = '/'.array_pop($this->list_level_types);
                $this->list_level--;
            } else {
                $listchar = substr($matches[2],-1);
                $listtype = $listtypes[$listchar];
                $this->list_level++;
                array_push($this->list_level_types,$listtype);
            }
            $output .= "\n<{$listtype}>\n";
        }
        if ($close) return $output;
        $output .= "<li>".trim($matches[3]);
        return $output;
    }

    /**
     * Line handler for preformatted lines; also used to close an open <pre>.
     *
     * @param array|false $matches [1] = line content
     * @param bool $close when true, emit only the closing tag
     * @return string HTML fragment
     */
    public function handle_preformat($matches,$close=false) {
        if ($close) {
            $this->preformat = false;
            return "</pre>\n";
        }
        $this->stop_all = true;
        $output = "";
        if (!$this->preformat) $output .= "<pre>";
        $this->preformat = true;
        $output .= $matches[1];
        return $output."\n";
    }

    /**
     * Line handler for "----".
     *
     * @param array $matches unused
     * @return string
     */
    public function handle_horizontalrule($matches) {
        return "<hr />";
    }

    /** Returns an empty string; not called from within this class. */
    public function small_off() {
        return '';
    }

    /**
     * Replacement callback that removes whatever was matched.
     *
     * @param array $matches unused
     * @return string always ''
     */
    public function handle_eliminate($matches) {
        return "";
    }

    /**
     * Close the currently open table, if any.
     *
     * @param bool $close must be true for anything to happen
     * @return string "</table>" when a table was open and $close is true, else ''
     */
    public function table($close=false) {
        if ($close && $this->table) {
            $this->table = false;
            return "</table>";
        }
        return '';
    }

    /**
     * Line handler for "|| cell || cell ||": emits one table row, opening the
     * table first if none is open.
     *
     * @param array $matches [1] = text between the outer '||' delimiters
     * @return string HTML fragment
     */
    public function handle_table($matches) {
        $this->stop_all = true;
        $row = explode('||',$matches[1]);
        $output = '';
        if (!$this->table) {
            $output .= '<table>';
            $this->table = true;
        }
        $output .= '<tr valign="top">';
        foreach ($row as $cell) {
            $output .= '<td style="border:1px solid #BBBBBB;padding:3px;margin:3px;">'.$this->parse_chars($cell).'</td>';
        }
        $output .= '</tr>';
        return $output;
    }

    /**
     * @param string $needle
     * @param string $haystack
     * @return bool whether $needle occurs anywhere in $haystack
     */
    public function in_string($needle,$haystack) {
        return strpos($haystack,$needle) !== false;
    }

    /**
     * Feed one character of the current line into the inline-markup state
     * machine.
     *
     * Finished output accumulates in $this->line. Text that belongs to a
     * still-open construct (emphasis, code span, link) is buffered in
     * $this->processing, and $this->lproc holds the currently open emphasis
     * markers. $this->link is 0 outside a link, 1 inside "[", 2 inside "[[".
     *
     * Known limitation: a closing marker is paired with the FIRST occurrence
     * of the same character in the buffer, so a literal marker character that
     * sits earlier in the buffer (for example the '_' of target="_self" in an
     * already rendered link) can still produce misplaced tags.
     *
     * @param string $char a single character (may be '')
     * @return void
     */
    public function parse_char($char) {
        $listeners = array('*','`','_');
        if (in_array($char,$listeners,true) and $this->link == 0) {
            if (!in_array($char,$this->lproc,true)) {
                // Opening marker: start buffering.
                $this->processing .= $char;
                $this->lproc[] = $char;
                return;
            }
            if (in_array('`',$this->lproc,true) and $char != '`') {
                // Inside a code span every other marker is literal text.
                $this->processing .= $char;
                return;
            }
            // Closing marker. Split into exactly two parts so that further
            // occurrences of the character in the buffer are kept rather than
            // silently dropped.
            $arr = explode($char,$this->processing,2);
            $inner = isset($arr[1]) ? $arr[1] : '';
            $scode = '';
            $ecode = '';
            if ($char == '*') {$scode = '<b>';  $ecode = '</b>';}
            if ($char == '`') {$scode = '<tt>'; $ecode = '</tt>';}
            if ($char == '_') {$scode = '<em>'; $ecode = '</em>';}
            $this->processing = $arr[0].$scode.$inner.$ecode;
            if (strlen($arr[0]) == 0) {
                // Nothing else is pending in front of this span: flush it.
                $this->line .= $this->processing;
                $this->processing = '';
                $this->lproc = array();
            } else {
                // Discard this marker and every marker opened after it.
                while ($this->lproc && array_pop($this->lproc) !== $char) {
                }
            }
            return;
        }

        if ($this->link and $char == ']') {
            if ($this->link == 1) {
                $this->finish_link();
            } else {
                // First ']' of a "]]" pair.
                $this->link = 1;
            }
            return;
        }

        if ($char == '[' and !in_array('`',$this->lproc,true)) {
            if (!$this->link) {
                $this->link = 1;
                $this->processing .= $char;
            } else {
                // Second '[' of a "[[" pair; it is not buffered.
                $this->link = 2;
            }
            return;
        }

        // Ordinary character: buffer it if a construct is open, else emit it.
        if (strlen($this->processing)) {
            $this->processing .= $char;
        } else {
            $this->line .= $char;
        }
    }

    /**
     * Turn the buffered "[target text" into an <a> (or <img>) element.
     *
     * Targets without '://' are treated as wiki pages: they get the
     * $reference_wiki prefix and, when no explicit text was given, the text
     * becomes the part after the last '/' and '#'.
     *
     * @return void
     */
    private function finish_link() {
        // While a link is open no further '[' is buffered, so the opener is
        // always the LAST '[' in the buffer.
        $open = strrpos($this->processing,'[');
        $this->link = 0;
        if ($open === false) {
            // No opener to pair with: keep the bracket as literal text.
            if (strlen($this->processing)) {
                $this->processing .= ']';
            } else {
                $this->line .= ']';
            }
            return;
        }
        // The casts cover PHP < 8, where substr() returns false at end of string.
        $before = (string) substr($this->processing,0,$open);
        $body = (string) substr($this->processing,$open + 1);

        $target = '_self';
        $link = $body;
        $text = $body;
        if ($this->in_string(' ',$body)) {
            $temp = explode(' ',$body,2);
            $link = $temp[0];
            $text = $temp[1];
        }
        if ($this->in_string('://',$link)) {
            $target = '_blank';
        } else {
            if ($link === $text) {
                $v1 = explode('/',$link);
                $v1 = explode('#',end($v1));
                $text = end($v1);
            }
            $link = $this->reference_wiki.$link;
        }
        $imgs = array('jpg','gif','bmp','png');
        if (in_array(strtolower(substr($link,-3,3)),$imgs,true)) {
            $this->processing = $before.'<img src="'.$link.'" title="'.$text.'" />';
        } else {
            $this->processing = $before.'<a href="'.$link.'" target="'.$target.'">'.$text.'</a>';
        }
        if (strlen($before) == 0) {
            $this->line .= $this->processing;
            $this->processing = '';
            $this->lproc = array();
        }
    }

    /**
     * Apply inline markup (emphasis, code spans, links, bare URLs) to a piece
     * of text.
     *
     * @param string $line text of one line or one table cell
     * @return string HTML
     */
    public function parse_chars($line) {
        // Wrap bare URLs in [...] so the character loop renders them as links.
        // NOTE: the unescaped '.' in this pattern is what lets it also match
        // the path part of a URL; do not "fix" it without reworking the pattern.
        $line=preg_replace('/(((http|https|ftp):\/\/([A-Z0-9][A-Z0-9_-]*(?:.[A-Z0-9][A-Z0-9_-]*)+):?(d+)?\/?))(?!([^<]+)?>)/i','[$0]',$line);
        $this->line = '';
        $this->processing = '';
        $this->lproc = array();
        // A link cannot span lines (the buffer is reset above), so the link
        // state must not leak from the previous line either.
        $this->link = 0;
        $line = preg_split('//', $line, -1);
        foreach ($line as $char) {
            $this->parse_char($char);
        }
        // Unwrap bracketed URLs that made it to the output untouched.
        $this->line=preg_replace('/(\[(((http|https|ftp):\/\/([A-Z0-9][A-Z0-9_-]*(?:.[A-Z0-9][A-Z0-9_-]*)+):?(d+)?\/?))(?!([^<]+)?>)\])/i','$2',$this->line);
        return $this->line.$this->processing;
    }

    /**
     * Convert one line of wiki text to HTML.
     *
     * The line is tested against the block-level patterns in order; each match
     * is passed to the corresponding handle_<name>() method, whose return
     * value replaces the line. Unless a handler set $stop_all, inline markup is
     * applied afterwards. Lists, <pre> blocks and tables left open by earlier
     * lines are closed when this line does not continue them.
     *
     * @param string $line one line without its trailing newline
     * @return string HTML followed by "\n"
     */
    public function parse_line($line) {
        // NOTE(review): as written the 'preformat' pattern cannot match (its
        // second '^' follows a consumed character), so handle_preformat() is
        // never reached from here. Left unchanged because the intended
        // pattern is unclear.
        $line_regexes = array(
            'preformat'=>'^\s(^[\*\#])$',
            'newline'=>'^$',
            'list'=>'^(\s+)([\*\#]+)(.*?)$',
            'sections'=>'^(={1,6})(.*?)(={1,6})$',
            'horizontalrule'=>'^----$',
            'table'=>'^\|\|(.*?)\|\|$'
        );
        $this->stop = false;
        $this->stop_all = false;
        $called = array();
        $line = rtrim($line);
        foreach ($line_regexes as $func=>$regex) {
            if (preg_match("/$regex/i",$line,$matches)) {
                $called[$func] = true;
                $func = "handle_".$func;
                $line = $this->$func($matches);
                if ($this->stop || $this->stop_all) break;
            }
        }
        if (!$this->stop_all) {
            $this->stop = false;
            $line = $this->parse_chars($line);
        }
        $isline = strlen(trim($line)) > 0;
        // if this wasn't a list item, and we are in a list, close the list tag(s)
        if (($this->list_level > 0) && !isset($called['list'])) $line = $this->handle_list(false,true) . $line;
        if ($this->preformat && !isset($called['preformat'])) $line = $this->handle_preformat(false,true) . $line;
        // likewise close a table that this line does not continue (a blank
        // line has already closed it in handle_newline())
        if ($this->table && !isset($called['table'])) $line = $this->table(true) . $line;
        // suppress linebreaks for the next line if we just displayed one; otherwise re-enable them
        if ($isline) $this->suppress_linebreaks = (isset($called['newline']) || isset($called['sections']));
        return $line."\n";
    }

    /**
     * Parse a complete wiki document.
     *
     * All per-document state is reset first, so one instance can be reused
     * for several documents without headings or code blocks leaking from one
     * call into the next.
     *
     * @param string $text wiki source
     * @param string $title stored in $page_title; not used for rendering here
     * @return array array('contents' => HTML string, 'headers' => list of
     *               array('name', 'slug', 'level'))
     */
    public function parse($text,$title="") {
        $this->redirect = false;
        $this->nowikis = array();
        $this->list_level_types = array();
        $this->list_level = 0;
        $this->deflist = false;
        $this->linknumber = 0;
        $this->suppress_linebreaks = false;
        $this->page_title = $title;
        $this->headers = array();
        $this->codeblocks = array();
        $this->codeblocks_ptr = -1;
        $this->table = false;
        $this->preformat = false;
        $this->inp = false;
        $this->link = 0;
        $output = "";
        // Swap {{{ ... }}} blocks for a placeholder so their contents are not
        // interpreted as wiki markup; they are restored after the line pass.
        $text = preg_replace_callback('/[\{]{3}([\s\S]+?)[\}]{3}/i',array($this,"handle_save_codeblock"),$text);
        $lines = explode("\n",$text);
        // A trailing blank line makes sure open lists and tables get closed.
        $lines[] = '';
        if (preg_match('/^\#REDIRECT\s+\[\[(.*?)\]\]$/',trim($lines[0]),$matches)) {
            $this->redirect = $matches[1];
        }
        foreach ($lines as $line) {
            $output .= $this->parse_line($line);
        }
        if ($this->inp) {
            // Do not leave the last paragraph open.
            $output .= '</p>';
            $this->inp = false;
        }
        $output = preg_replace_callback('/\{\{\{\}\}\}/i',array($this,"handle_restore_codeblock"),$output);
        return array('contents'=>$output,'headers'=>$this->headers);
    }

    /**
     * Replacement callback: stash the matched text and leave an empty
     * <nowiki> placeholder. Not wired up by parse().
     *
     * @param array $matches [1] = text to protect
     * @return string placeholder
     */
    public function handle_save_nowiki($matches) {
        array_push($this->nowikis,$matches[1]);
        return "<nowiki></nowiki>";
    }

    /**
     * Replacement callback: return the most recently stashed nowiki text.
     *
     * @param array $matches unused
     * @return string|null
     */
    public function handle_restore_nowiki($matches) {
        return array_pop($this->nowikis);
    }

    /**
     * Replacement callback: stash the body of a {{{ ... }}} block and leave
     * the empty placeholder "{{{}}}" in the text.
     *
     * @param array $matches [1] = code block body
     * @return string placeholder
     */
    public function handle_save_codeblock($matches) {
        array_push($this->codeblocks,$matches[1]);
        return "{{{}}}";
    }

    /**
     * Replacement callback: replace the next "{{{}}}" placeholder with its
     * stashed code block, highlighted via highlight_string(), with every run
     * of two or more whitespace characters converted to &nbsp; entities.
     *
     * @param array $matches [0] = the placeholder
     * @return string HTML
     */
    public function handle_restore_codeblock($matches) {
        $this->codeblocks_ptr++;
        if (!isset($this->codeblocks[$this->codeblocks_ptr])) {
            // A literal "{{{}}}" in the source with nothing stashed for it.
            return $matches[0];
        }
        $code = highlight_string(trim($this->codeblocks[$this->codeblocks_ptr]),true);
        return preg_replace_callback('/([\s]{2,})/i',array($this,"sp_2_nbsp"),str_replace('&nbsp;',' ',$code));
    }

    /**
     * Replacement callback: one "&nbsp;" per matched whitespace character.
     *
     * @param array $matches [0] = run of whitespace
     * @return string
     */
    public function sp_2_nbsp($matches) {
        return str_repeat("&nbsp;",strlen($matches[0]));
    }
}
?>

I hope I haven’t forgotten any key part of the code here! If you have any problems, let me know!

The script returns an array with 2 parts: contents and headers. Contents has the parsed HTML and headers contains an array of the titles within the page (which are all assigned anchors), so you can have a ‘page contents’ list if you like.

Enjoy!

About these ads

4 Responses to “My Wiki Parser Class for PHP”

  1. Jack Says:

    Hi!

    Very Nice Solution but no work overall.
    Best Example is the PHP Wiki Site!
    Tables are not parsed!

  2. Edmund Gentle Says:

    Tables do get parsed by this.

    Simply use || for the beginning and end of each cell.

  3. j0k Says:

    Hi there,

    This code generated 2 table instead of one table with 2 tr. Strange …

    || Your text here…|| Your text here… ||
    || Your text here…|| Your text here… ||


Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s

Follow

Get every new post delivered to your Inbox.

%d bloggers like this: