merge from 1.5.3.2 (~r11225)

git-svn-id: http://xe-core.googlecode.com/svn/trunk@11226 201d5d3c-b55e-5fd7-737f-ddc643e51545
This commit is contained in:
flyskyko 2012-09-13 06:14:45 +00:00
parent 54e3a72065
commit 77f5aa2671
313 changed files with 8058 additions and 14251 deletions

View file

@ -0,0 +1,7 @@
/ex_dumptags.php/1.2/Wed Oct 29 16:42:53 2003//
/ex_dumpurl.php/1.3/Fri Apr 16 15:52:39 2004//
/ex_html2text.php/1.2/Fri Apr 16 15:52:39 2004//
/html2text.inc/1.3/Wed Oct 29 16:42:53 2003//
/htmlparser.inc/1.5/Fri Apr 16 15:52:39 2004//
/testfile.html/1.2/Fri Apr 16 15:52:39 2004//
D

View file

@ -0,0 +1,6 @@
/ex_dumptags.php////
/ex_dumpurl.php////
/ex_html2text.php////
/html2text.inc////
/htmlparser.inc////
/testfile.html////

View file

@ -0,0 +1 @@
phphtmlparser/src

View file

@ -0,0 +1 @@
:ext:jhsolorz@cvs.sourceforge.net:/cvsroot/php-html

View file

@ -0,0 +1,11 @@
<?php
// Example: dumps the type, name and value of every node found in a
// small in-memory HTML string.
// Uses the full "<?php" open tag so the script still runs when the
// short_open_tag ini setting is disabled.
include ("htmlparser.inc");
$htmlText = "<html><!-- comment --><body>This is the body</body></html>";
$parser = new HtmlParser($htmlText);
// parse() returns false once the end of the HTML text has been reached.
while ($parser->parse()) {
    echo "-----------------------------------\r\n";
    echo "Node type: " . $parser->iNodeType . "\r\n";
    echo "Node name: " . $parser->iNodeName . "\r\n";
    echo "Node value: " . $parser->iNodeValue . "\r\n";
}
?>

View file

@ -0,0 +1,29 @@
<?php
// Example:
// Dumps nodes from testfile.html.
// To run: php < ex_dumpurl.php
// Uses the full "<?php" open tag so the script still runs when the
// short_open_tag ini setting is disabled.
include ("htmlparser.inc");
$parser = HtmlParser_ForFile ("testfile.html");
//$parser = HtmlParser_ForURL ("http://yahoo.com");
while ($parser->parse()) {
    echo "-----------------------------------\r\n";
    echo "Name=" . $parser->iNodeName . ";";
    echo "Type=" . $parser->iNodeType . ";";
    // Only text and comment nodes carry a value worth printing.
    if ($parser->iNodeType == NODE_TYPE_TEXT || $parser->iNodeType == NODE_TYPE_COMMENT) {
        echo "Value='" . $parser->iNodeValue . "'";
    }
    echo "\r\n";
    if ($parser->iNodeType == NODE_TYPE_ELEMENT) {
        echo "ATTRIBUTES: ";
        // iNodeAttributes maps lowercase attribute names to their values.
        foreach ($parser->iNodeAttributes as $attrName => $attrValue) {
            echo $attrName . "=\"" . $attrValue . "\" ";
        }
    }
    echo "\r\n";
}
?>

View file

@ -0,0 +1,18 @@
<?php
// Example: html2text
// Converts HTML to formatted ASCII text.
// Run with: php < ex_html2text.php
// Uses the full "<?php" open tag so the script still runs when the
// short_open_tag ini setting is disabled.
include ("html2text.inc");
$htmlText = "Html2text is a tool that allows you to<br>" .
    "convert HTML to text.<p>" .
    "Does it work?";
// Second argument is the maximum number of characters per output line.
$htmlToText = new Html2Text ($htmlText, 15);
$text = $htmlToText->convert();
echo "Conversion follows:\r\n";
echo "-------------------\r\n";
echo $text;
?>

View file

@ -0,0 +1,214 @@
<?php
/*
* Copyright (c) 2003 Jose Solorzano. All rights reserved.
* Redistribution of source must retain this copyright notice.
*/
// include_once (not include): htmlparser.inc declares a class, functions
// and constants, so loading it a second time in the same request would
// fail with redeclaration errors.
include_once ("htmlparser.inc");
/**
* Class Html2Text. (HtmlParser example.)
* Converts HTML to ASCII attempting to preserve
* document structure.
* To use, create an instance of Html2Text passing
* the text to convert and the desired maximum
* number of characters per line. Then invoke
* convert() which returns ASCII text.
*/
class Html2Text {

    // Private fields
    var $iCurrentLine = "";     // text accumulated so far for the line being built
    var $iCurrentWord = "";     // pending word (string) or TOKEN_* value (int) not yet placed
    var $iCurrentWordArray;     // words of the text node currently being consumed
    var $iCurrentWordIndex;     // index of the next word to take from $iCurrentWordArray
    var $iInScript = false;     // true between <script> and </script>; words are dropped
    var $iInText = false;       // true while a text node is being handed out word by word
    var $iListLevel = 0;        // current <ul>/<ol> nesting depth
    var $iHtmlText;             // HTML passed to the constructor
    var $iMaxColumns;           // maximum number of characters per output line
    var $iHtmlParser;           // HtmlParser instance created by convert()

    // Constants
    // These are integers and are always compared with ===, so a literal
    // word such as "0" or "1" in the text is never mistaken for a token.
    var $TOKEN_BR = 0;
    var $TOKEN_P = 1;
    var $TOKEN_LI = 2;
    var $TOKEN_AFTERLI = 3;
    var $TOKEN_UL = 4;
    var $TOKEN_ENDUL = 5;

    /**
     * Constructor.
     * @param string $aHtmlText   HTML text to convert.
     * @param int    $aMaxColumns Desired maximum number of characters per line.
     */
    function __construct ($aHtmlText, $aMaxColumns) {
        $this->iHtmlText = $aHtmlText;
        $this->iMaxColumns = $aMaxColumns;
    }

    /**
     * PHP 4 style constructor, kept so existing code that calls it
     * explicitly keeps working. PHP 8 no longer treats a method named
     * after the class as a constructor, hence __construct() above.
     */
    function Html2Text ($aHtmlText, $aMaxColumns) {
        $this->__construct($aHtmlText, $aMaxColumns);
    }

    /**
     * Converts the HTML given to the constructor.
     * @return string The converted text, every line terminated by "\r\n".
     */
    function convert() {
        $this->iHtmlParser = new HtmlParser($this->iHtmlText);
        // Start from a clean state so that convert() can be called more
        // than once on the same instance.
        $this->iCurrentLine = "";
        $this->iCurrentWord = "";
        $this->iInText = false;
        $this->iInScript = false;
        $this->iListLevel = 0;
        $wholeText = "";
        while (($line = $this->getLine()) !== false) {
            $wholeText .= ($line . "\r\n");
        }
        return $wholeText;
    }

    /**
     * Builds the next output line.
     * @return string|false The line (without line terminator), or false
     *                      when the input is exhausted.
     */
    function getLine() {
        while (true) {
            if (!$this->addWordToLine($this->iCurrentWord)) {
                // The pending word/token ends the line; it stays in
                // $iCurrentWord (possibly rewritten) for the next call.
                $retvalue = $this->iCurrentLine;
                $this->iCurrentLine = "";
                return $retvalue;
            }
            $word = $this->getWord();
            if ($word === false) {
                // Input exhausted. A line holding nothing but list
                // indentation counts as empty; otherwise a document that
                // ends inside an open list would yield indentation-only
                // lines forever.
                if (trim($this->iCurrentLine) == "") {
                    $this->iCurrentLine = "";
                    break;
                }
                $retvalue = $this->iCurrentLine;
                $this->iCurrentLine = "";
                $this->iInText = false;
                $this->iCurrentWord = "";
                return $retvalue;
            }
        }
        return false;
    }

    /**
     * Tries to append a word or token to the current line.
     * @param string|int $word A word, or one of the TOKEN_* values.
     * @return bool true if the word was consumed; false if the current
     *              line must be emitted first.
     */
    function addWordToLine ($word) {
        if ($this->iInScript) {
            // Script content is swallowed without producing output.
            return true;
        }
        if ($word === $this->TOKEN_BR) {
            $this->iCurrentWord = "";
            return false;
        }
        if ($word === $this->TOKEN_P || $word === $this->TOKEN_UL || $word === $this->TOKEN_ENDUL) {
            // End the current line, then let the queued BR end one more.
            $this->iCurrentWord = $this->TOKEN_BR;
            return false;
        }
        if ($word === $this->TOKEN_LI) {
            // End the current line; the next line starts with a bullet.
            $this->iCurrentWord = $this->TOKEN_AFTERLI;
            return false;
        }
        $startsItem = ($word === $this->TOKEN_AFTERLI);
        $toAdd = $startsItem ? "" : $word;
        $lineHadContent = ($this->iCurrentLine != "");
        if ($lineHadContent) {
            $prefix = $this->iCurrentLine . " ";
        }
        else {
            $prefix = $this->getIndentation($startsItem);
        }
        $candidateLine = $prefix . $toAdd;
        // Defer the word only if the line already holds something. A word
        // that does not fit on an otherwise empty line is placed anyway,
        // because deferring it could never make it fit (and previously
        // caused an endless stream of empty lines inside lists, where the
        // indentation made the prefix non-empty).
        if (strlen ($candidateLine) > $this->iMaxColumns && $lineHadContent) {
            return false;
        }
        $this->iCurrentLine = $candidateLine;
        return true;
    }

    /**
     * Fetches the next word or token from the parser and stores it in
     * $iCurrentWord.
     * @return string|int|false The word or TOKEN_* value, or false when
     *                          the parser has no more nodes.
     */
    function getWord() {
        while (true) {
            $nodeType = $this->iHtmlParser->iNodeType;
            if ($nodeType == NODE_TYPE_TEXT) {
                if (!$this->iInText) {
                    $this->iCurrentWordArray = $this->splitWords($this->iHtmlParser->iNodeValue);
                    $this->iCurrentWordIndex = 0;
                    $this->iInText = true;
                }
                if ($this->iCurrentWordIndex < count($this->iCurrentWordArray)) {
                    $this->iCurrentWord = $this->iCurrentWordArray[$this->iCurrentWordIndex++];
                    return $this->iCurrentWord;
                }
                // Text node used up; fall through to advance the parser.
                $this->iInText = false;
            }
            else if ($nodeType == NODE_TYPE_ELEMENT) {
                // In every recognised case the parser is advanced before
                // returning, so the same tag is not reported twice.
                if ($this->currentNodeIs("br")) {
                    $this->iHtmlParser->parse();
                    $this->iCurrentWord = $this->TOKEN_BR;
                    return $this->iCurrentWord;
                }
                if ($this->currentNodeIs("p")) {
                    $this->iHtmlParser->parse();
                    $this->iCurrentWord = $this->TOKEN_P;
                    return $this->iCurrentWord;
                }
                if ($this->currentNodeIs("script")) {
                    $this->iHtmlParser->parse();
                    $this->iCurrentWord = "";
                    $this->iInScript = true;
                    return $this->iCurrentWord;
                }
                if ($this->currentNodeIs("ul") || $this->currentNodeIs("ol")) {
                    $this->iHtmlParser->parse();
                    $this->iCurrentWord = $this->TOKEN_UL;
                    $this->iListLevel++;
                    return $this->iCurrentWord;
                }
                if ($this->currentNodeIs("li")) {
                    $this->iHtmlParser->parse();
                    $this->iCurrentWord = $this->TOKEN_LI;
                    return $this->iCurrentWord;
                }
            }
            else if ($nodeType == NODE_TYPE_ENDELEMENT) {
                if ($this->currentNodeIs("script")) {
                    $this->iHtmlParser->parse();
                    $this->iCurrentWord = "";
                    $this->iInScript = false;
                    return $this->iCurrentWord;
                }
                if ($this->currentNodeIs("ul") || $this->currentNodeIs("ol")) {
                    $this->iHtmlParser->parse();
                    $this->iCurrentWord = $this->TOKEN_ENDUL;
                    if ($this->iListLevel > 0) {
                        $this->iListLevel--;
                    }
                    return $this->iCurrentWord;
                }
            }
            // Any other node (or an exhausted text node) is skipped.
            if (!$this->iHtmlParser->parse()) {
                break;
            }
        }
        return false;
    }

    /**
     * Private helper: case-insensitive comparison of the parser's current
     * node name with $name.
     */
    function currentNodeIs ($name) {
        return strcasecmp ($this->iHtmlParser->iNodeName, $name) == 0;
    }

    /**
     * Splits text on runs of spaces, tabs, CRs and LFs and decodes each
     * piece with htmlDecode(). Leading or trailing whitespace produces an
     * empty first or last element.
     * @param string $text
     * @return array List of words.
     */
    function splitWords ($text) {
        // preg_split() replaces split(), which was removed in PHP 7.
        $words = preg_split ("/[ \t\r\n]+/", $text);
        if ($words === false) {
            // Regex engine failure: treat the whole text as one word.
            $words = array($text);
        }
        for ($idx = 0; $idx < count($words); $idx++) {
            $words[$idx] = $this->htmlDecode($words[$idx]);
        }
        return $words;
    }

    /**
     * Placeholder for entity decoding; currently returns $text unchanged.
     */
    function htmlDecode ($text) {
        // TBD
        return $text;
    }

    /**
     * Returns the prefix for a new line at the current list depth.
     * @param bool $hasLI true if the line starts a list item ("- " marker).
     * @return string Empty when not inside a list.
     */
    function getIndentation ($hasLI) {
        if ($this->iListLevel <= 0) {
            return "";
        }
        // One space per enclosing list beyond the innermost one...
        $indent = str_repeat(" ", $this->iListLevel - 1);
        // ...followed by the bullet, or by padding on continuation lines.
        return $indent . ($hasLI ? "- " : " ");
    }
}

View file

@ -0,0 +1,365 @@
<?php
/*
* Copyright (c) 2003 Jose Solorzano. All rights reserved.
* Redistribution of source must retain this copyright notice.
*
* Jose Solorzano (http://jexpert.us) is a software consultant.
*
* Contributions by:
* - Leo West (performance improvements)
*/
// Node type codes stored in HtmlParser::$iNodeType.
// NOTE(review): NODE_TYPE_START is not assigned anywhere in this file;
// presumably it was meant as the state before the first parse() - confirm.
define ("NODE_TYPE_START",0);
// Opening (or self-closing) tag with a valid tag name.
define ("NODE_TYPE_ELEMENT",1);
// Closing tag, i.e. "</name>".
define ("NODE_TYPE_ENDELEMENT",2);
// Text between tags; also used for a "<..." sequence that is neither a
// valid tag nor a comment.
define ("NODE_TYPE_TEXT",3);
// "<!-- ... -->" comment.
define ("NODE_TYPE_COMMENT",4);
// Set when parse() finds no further "<" and returns false.
define ("NODE_TYPE_DONE",5);
/**
* Class HtmlParser.
* To use, create an instance of the class passing
* HTML text. Then invoke parse() until it's false.
* When parse() returns true, $iNodeType, $iNodeName
* $iNodeValue and $iNodeAttributes are updated.
*
* To create an HtmlParser instance you may also
* use convenience functions HtmlParser_ForFile
* and HtmlParser_ForURL.
*/
class HtmlParser {

    /**
     * Field iNodeType.
     * May be one of the NODE_TYPE_* constants above.
     */
    var $iNodeType;

    /**
     * Field iNodeName.
     * For elements, it's the name of the element.
     * Text nodes use "Text" and comments use "Comment".
     */
    var $iNodeName = "";

    /**
     * Field iNodeValue.
     * For text nodes, it's the text. For comments, it's the whole
     * comment including the "<!--" and "-->" delimiters.
     */
    var $iNodeValue = "";

    /**
     * Field iNodeAttributes.
     * A string-indexed array containing attribute values
     * of the current node. Indexes are always lowercase.
     */
    var $iNodeAttributes;

    // The following fields should be
    // considered private:
    var $iHtmlText;
    var $iHtmlTextLength;
    var $iHtmlTextIndex = 0;
    // Never read or written by this class; retained only so that the
    // set of declared properties stays backward-compatible.
    var $iHtmlCurrentChar;
    // Character at $iHtmlTextIndex, or the integer -1 once the index is
    // past the end of the text. All scanning methods read this field.
    var $iCurrentChar;
    var $BOE_ARRAY;   // blanks or "="
    var $B_ARRAY;     // blanks
    var $BOS_ARRAY;   // blanks or "/"

    /**
     * Constructor.
     * Constructs an HtmlParser instance with
     * the HTML text given.
     */
    function __construct ($aHtmlText) {
        $this->iHtmlText = (string) $aHtmlText;
        $this->iHtmlTextLength = strlen($this->iHtmlText);
        $this->iNodeAttributes = array();
        $this->setTextIndex (0);
        $this->BOE_ARRAY = array (" ", "\t", "\r", "\n", "=" );
        $this->B_ARRAY = array (" ", "\t", "\r", "\n" );
        $this->BOS_ARRAY = array (" ", "\t", "\r", "\n", "/" );
    }

    /**
     * PHP 4 style constructor, kept so existing code that calls it
     * explicitly keeps working. PHP 8 no longer treats a method named
     * after the class as a constructor, hence __construct() above.
     */
    function HtmlParser ($aHtmlText) {
        $this->__construct($aHtmlText);
    }

    /**
     * Method parse.
     * Parses the next node. Returns false only if
     * the end of the HTML text has been reached.
     * Updates values of iNode* fields.
     */
    function parse() {
        $text = $this->skipToElement();
        if ($text !== "") {
            $this->iNodeType = NODE_TYPE_TEXT;
            $this->iNodeName = "Text";
            $this->iNodeValue = $text;
            return true;
        }
        return $this->readTag();
    }

    /** Empties iNodeAttributes. */
    function clearAttributes() {
        $this->iNodeAttributes = array();
    }

    /**
     * Reads the tag, comment or stray "<" at the current position and
     * updates the iNode* fields.
     * @return bool false (with iNodeType = NODE_TYPE_DONE) if the current
     *              character is not "<"; true otherwise.
     */
    function readTag() {
        if ($this->iCurrentChar !== "<") {
            $this->iNodeType = NODE_TYPE_DONE;
            return false;
        }
        $this->clearAttributes();
        $this->skipMaxInTag ("<", 1);

        // Closing tag: "</name ...>"
        if ($this->iCurrentChar === '/') {
            $this->moveNext();
            $name = $this->skipToBlanksInTag();
            $this->iNodeType = NODE_TYPE_ENDELEMENT;
            $this->iNodeName = $name;
            $this->iNodeValue = "";
            $this->skipEndOfTag();
            return true;
        }

        $name = $this->skipToBlanksOrSlashInTag();
        if (!$this->isValidTagIdentifier ($name)) {
            if (strpos($name, "!--") === 0) {
                if (strpos($name, "--", 3) === (strlen($name) - 2)) {
                    // Comment without blanks, e.g. "<!--x-->": $name
                    // already holds everything up to the closing ">".
                    $this->iNodeType = NODE_TYPE_COMMENT;
                    $this->iNodeName = "Comment";
                    $this->iNodeValue = "<" . $name . ">";
                    $this->skipEndOfTag();
                    return true;
                }
                $rest = $this->skipToStringInTag ("-->");
                if ($rest !== "") {
                    $this->iNodeType = NODE_TYPE_COMMENT;
                    $this->iNodeName = "Comment";
                    $this->iNodeValue = "<" . $name . $rest;
                    // Already skipped end of tag
                    return true;
                }
            }
            // Neither a valid tag nor a terminated comment: report what
            // was read as text and leave the rest for the next parse().
            $this->iNodeType = NODE_TYPE_TEXT;
            $this->iNodeName = "Text";
            $this->iNodeValue = "<" . $name;
            return true;
        }

        $this->iNodeType = NODE_TYPE_ELEMENT;
        $this->iNodeValue = "";
        $this->iNodeName = $name;
        // Each attribute must be preceded by at least one blank.
        while ($this->skipBlanksInTag()) {
            $attrName = $this->skipToBlanksOrEqualsInTag();
            if ($attrName != "" && $attrName != "/") {
                $this->skipBlanksInTag();
                if ($this->iCurrentChar === "=") {
                    $this->skipEqualsInTag();
                    $this->skipBlanksInTag();
                    $value = $this->readValueInTag();
                    $this->iNodeAttributes[strtolower($attrName)] = $value;
                }
                else {
                    // Attribute without a value, e.g. <input disabled>.
                    $this->iNodeAttributes[strtolower($attrName)] = "";
                }
            }
        }
        $this->skipEndOfTag();
        return true;
    }

    /**
     * Tells whether $name consists only of letters, digits, "_" and "-"
     * (and is not empty).
     */
    function isValidTagIdentifier ($name) {
        // preg_match() replaces ereg(), which was removed in PHP 7.
        // The D modifier makes "$" match only at the very end, as in ereg.
        return (bool) preg_match ('/^[A-Za-z0-9_\-]+$/D', $name);
    }

    /** Skips blanks; returns true if at least one was skipped. */
    function skipBlanksInTag() {
        return "" != ($this->skipInTag ($this->B_ARRAY));
    }

    /** Reads up to the next blank, "=" or ">". */
    function skipToBlanksOrEqualsInTag() {
        return $this->skipToInTag ($this->BOE_ARRAY);
    }

    /** Reads up to the next blank or ">". */
    function skipToBlanksInTag() {
        return $this->skipToInTag ($this->B_ARRAY);
    }

    /** Reads up to the next blank, "/" or ">". */
    function skipToBlanksOrSlashInTag() {
        return $this->skipToInTag ($this->BOS_ARRAY);
    }

    /** Skips a single "=" if present. */
    function skipEqualsInTag() {
        return $this->skipMaxInTag ("=", 1);
    }

    /**
     * Reads an attribute value: double-quoted, single-quoted or bare.
     * Note that the scan always stops at ">", even inside quotes.
     */
    function readValueInTag() {
        $quote = $this->iCurrentChar;
        if ($quote === "\"" || $quote === "'") {
            $this->skipMaxInTag ($quote, 1);
            $value = $this->skipToInTag ($quote);
            $this->skipMaxInTag ($quote, 1);
            return $value;
        }
        return $this->skipToBlanksInTag();
    }

    /**
     * Moves to $index and refreshes iCurrentChar (-1 when past the end).
     */
    function setTextIndex ($index) {
        $this->iHtmlTextIndex = $index;
        if ($index >= $this->iHtmlTextLength) {
            $this->iCurrentChar = -1;
        }
        else {
            // Square brackets: the {} offset syntax is a parse error in PHP 8.
            $this->iCurrentChar = $this->iHtmlText[$index];
        }
    }

    /** Advances one character; returns false if already at the end. */
    function moveNext() {
        if ($this->iHtmlTextIndex < $this->iHtmlTextLength) {
            $this->setTextIndex ($this->iHtmlTextIndex + 1);
            return true;
        }
        else {
            return false;
        }
    }

    /** Advances past the next ">" (or to the end of the text). */
    function skipEndOfTag() {
        while (($ch = $this->iCurrentChar) !== -1) {
            $this->moveNext();
            if ($ch == ">") {
                return;
            }
        }
    }

    /**
     * Private helper: tells whether the single character $ch is one of
     * $chars. $chars may be an array of characters or a string; internal
     * callers pass both forms, and count() on a string is a TypeError in
     * PHP 8.
     */
    function isOneOf ($ch, $chars) {
        if (is_array($chars)) {
            return in_array($ch, $chars, true);
        }
        return strpos((string) $chars, $ch) !== false;
    }

    /**
     * Consumes consecutive characters that are in $chars, stopping at
     * ">" or at the end of the text. Returns what was consumed.
     */
    function skipInTag ($chars) {
        $sb = "";
        while (($ch = $this->iCurrentChar) !== -1) {
            if ($ch == ">" || !$this->isOneOf($ch, $chars)) {
                return $sb;
            }
            $sb .= $ch;
            $this->moveNext();
        }
        return $sb;
    }

    /**
     * Like skipInTag(), but consumes at most $maxChars characters.
     */
    function skipMaxInTag ($chars, $maxChars) {
        $sb = "";
        $count = 0;
        while (($ch = $this->iCurrentChar) !== -1 && $count++ < $maxChars) {
            if ($ch == ">" || !$this->isOneOf($ch, $chars)) {
                return $sb;
            }
            $sb .= $ch;
            $this->moveNext();
        }
        return $sb;
    }

    /**
     * Consumes characters until one in $chars, a ">" or the end of the
     * text is reached. Returns what was consumed; the terminating
     * character itself is not consumed.
     */
    function skipToInTag ($chars) {
        $sb = "";
        while (($ch = $this->iCurrentChar) !== -1) {
            if ($ch == ">" || $this->isOneOf($ch, $chars)) {
                return $sb;
            }
            $sb .= $ch;
            $this->moveNext();
        }
        return $sb;
    }

    /**
     * Consumes and returns everything up to (not including) the next "<",
     * or up to the end of the text.
     */
    function skipToElement() {
        $sb = "";
        while (($ch = $this->iCurrentChar) !== -1) {
            if ($ch == "<") {
                return $sb;
            }
            $sb .= $ch;
            $this->moveNext();
        }
        return $sb;
    }

    /**
     * Returns text between current position and $needle,
     * inclusive, or "" if not found. The current index is moved to a point
     * after the location of $needle, or not moved at all
     * if nothing is found.
     */
    function skipToStringInTag ($needle) {
        $pos = strpos ($this->iHtmlText, $needle, $this->iHtmlTextIndex);
        if ($pos === false) {
            return "";
        }
        $top = $pos + strlen($needle);
        $retvalue = substr ($this->iHtmlText, $this->iHtmlTextIndex, $top - $this->iHtmlTextIndex);
        $this->setTextIndex ($top);
        return $retvalue;
    }
}
/**
 * Convenience function: builds an HtmlParser for the contents of a file.
 * Simply hands $fileName to HtmlParser_ForURL(), which opens it with
 * fopen().
 */
function HtmlParser_ForFile ($fileName) {
    $parser = HtmlParser_ForURL($fileName);
    return $parser;
}
/**
 * Convenience function: reads everything from $url (anything fopen()
 * accepts in "r" mode) and returns an HtmlParser for it.
 *
 * If the resource cannot be opened, a parser over empty content is
 * returned; fopen() itself reports the failure as a warning. Previously
 * the failed handle was passed on to fread()/fclose(), which produced
 * further warnings on older PHP and throws a TypeError on PHP 8.
 */
function HtmlParser_ForURL ($url) {
    $content = "";
    $fp = fopen ($url, "r");
    if ($fp !== false) {
        while (true) {
            $data = fread ($fp, 8192);
            // false signals a read error, "" the end of the stream.
            if ($data === false || strlen($data) == 0) {
                break;
            }
            $content .= $data;
        }
        fclose ($fp);
    }
    return new HtmlParser ($content);
}
?>

View file

@ -0,0 +1,8 @@
<!-- first comment --> <!-- second comment -->
<elem attribute1="foobar" attribute2=""/>Text After Elem
<!--comment1-->
<elem2>Text</elem2>
<!-- comment2-->
<elem3 attribute3='insinglequotes'/>
<!--comment3 -->Text between comments<!-- comment4 -->
<elem4/>