<?php
/**
* Website: http://sourceforge.net/projects/simplehtmldom/
* Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/)
* Contributions by :
* Yousuke Kumakura ( Attribute filters )
* Vadim Voituk ( Negative indexes supports of " find " method )
* Antcs ( Constructor with automatically load contents either text or file / url )
*
* all affected sections have comments starting with " PaperG "
*
* Paperg - Added case insensitive testing of the value of the selector .
* Paperg - Added tag_start for the starting index of tags - NOTE : This works but not accurately .
* This tag_start gets counted AFTER \r\n have been crushed out , and after the remove_noice calls so it will not reflect the REAL position of the tag in the source ,
* it will almost always be smaller by some amount .
* We use this to determine how far into the file the tag in question is. This "percentage" will never be accurate as the $dom->size is the "real" number of bytes the dom was created from,
* but for most purposes , it ' s a really good estimation .
* Paperg - Added the forceTagsClosed to the dom constructor . Forcing tags closed is great for malformed html , but it CAN lead to parsing errors .
* Allow the user to tell us how much they trust the html .
* Paperg add the text and plaintext to the selectors for the find syntax . plaintext implies text in the innertext of a node . text implies that the tag is a text node .
* This allows for us to find tags based on the text they contain .
* Create find_ancestor_tag to see if a tag is - at any level - inside of another specific tag .
* Paperg : added parse_charset so that we know about the character set of the source document .
* NOTE: If the user's system has a routine called get_last_retrieve_url_contents_content_type available, we will assume it's returning the content-type header from the
* last transfer or curl_exec, and we will parse that and use it in preference to any other method of charset detection.
*
* Found infinite loop in the case of broken html in restore_noise . Rewrote to protect from that .
* PaperG ( John Schlick ) Added get_display_size for " IMG " tags .
*
* Licensed under The MIT License
* Redistributions of files must retain the above copyright notice .
*
* @author S.C. Chen <me578022@gmail.com>
* @author John Schlick
* @author Rus Carroll
* @version 1.5 ($Rev: 196 $)
* @package PlaceLocalInclude
* @subpackage simple_html_dom
*/
/**
* All of the Defines for the classes below.
* @author S.C. Chen <me578022@gmail.com>
*/
// Node types; simple_html_dom_node::$nodetype holds one of these.
define('HDOM_TYPE_ELEMENT', 1);
define('HDOM_TYPE_COMMENT', 2);
define('HDOM_TYPE_TEXT', 3);
define('HDOM_TYPE_ENDTAG', 4);
define('HDOM_TYPE_ROOT', 5);
define('HDOM_TYPE_UNKNOWN', 6);

// Attribute quoting styles, looked up per attribute in the HDOM_INFO_QUOTE slot.
define('HDOM_QUOTE_DOUBLE', 0);
define('HDOM_QUOTE_SINGLE', 1);
define('HDOM_QUOTE_NO', 3);

// Keys of the per-node "info" array (simple_html_dom_node::$_).
define('HDOM_INFO_BEGIN', 0);
define('HDOM_INFO_END', 1);
define('HDOM_INFO_QUOTE', 2);
define('HDOM_INFO_SPACE', 3);
define('HDOM_INFO_TEXT', 4);
define('HDOM_INFO_INNER', 5);
define('HDOM_INFO_OUTER', 6);
define('HDOM_INFO_ENDSPACE', 7);

// Defaults used by the file_get_html() / str_get_html() helpers.
define('DEFAULT_TARGET_CHARSET', 'UTF-8');
// NOTE(review): the spaces around \r and \n look like a formatting artifact;
// confirm whether the intended value is a bare "\r\n" before relying on it.
define('DEFAULT_BR_TEXT', " \r \n ");
define('DEFAULT_SPAN_TEXT', " ");
// Inputs longer than this many bytes are rejected by file_get_html() and str_get_html().
define('MAX_FILE_SIZE', 600000);
// helper functions
// -----------------------------------------------------------------------------
/**
 * Builds a DOM object from the contents of a file or URL.
 *
 * The contents are read with file_get_contents(). An empty result, a failed
 * read, or contents longer than MAX_FILE_SIZE bytes make the function return
 * false.
 *
 * @param string        $url              Path or URL to read.
 * @param bool          $use_include_path Passed through to file_get_contents().
 * @param resource|null $context          Stream context passed through to file_get_contents().
 * @param int           $offset           Read offset. The default -1 is a sentinel meaning
 *                                        "start of the stream".
 * @param int           $maxLen           Maximum number of bytes to read; a negative value
 *                                        (the default) means "no limit".
 * @param bool          $lowercase        Forwarded to the simple_html_dom constructor and to load().
 * @param bool          $forceTagsClosed  Forwarded to the simple_html_dom constructor.
 * @param string        $target_charset   Forwarded to the simple_html_dom constructor.
 * @param bool          $stripRN          Forwarded to the simple_html_dom constructor and to load().
 * @param string        $defaultBRText    Forwarded to the simple_html_dom constructor.
 * @param string        $defaultSpanText  Forwarded to the simple_html_dom constructor.
 *
 * @return simple_html_dom|false The loaded DOM, or false when nothing usable was read.
 */
function file_get_html($url, $use_include_path = false, $context = null, $offset = -1, $maxLen = -1, $lowercase = true, $forceTagsClosed = true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN = true, $defaultBRText = DEFAULT_BR_TEXT, $defaultSpanText = DEFAULT_SPAN_TEXT) {
    // We DO force the tags to be terminated.
    $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);

    // Since PHP 7.1 a negative offset seeks from the END of the stream, so the
    // historical "-1 means no offset" default would read only the last byte.
    if ((int) $offset === -1) {
        $offset = 0;
    }

    // A negative length is rejected by PHP 8, so only pass $maxLen when it is a real limit.
    if ($maxLen !== null && $maxLen >= 0) {
        $contents = file_get_contents($url, $use_include_path, $context, $offset, $maxLen);
    } else {
        $contents = file_get_contents($url, $use_include_path, $context, $offset);
    }
    // Paperg - use our own mechanism for getting the contents as we want to control the timeout.
    //$contents = retrieve_url_contents($url);

    if (empty($contents) || strlen($contents) > MAX_FILE_SIZE) {
        // Release the unused DOM, mirroring what str_get_html() does on failure.
        $dom->clear();

        return false;
    }

    // The second parameter can force the selectors to all be lowercase.
    $dom->load($contents, $lowercase, $stripRN);

    return $dom;
}
/**
 * Builds a DOM object from an HTML string.
 *
 * @param string $str             HTML source.
 * @param bool   $lowercase       Forwarded to the simple_html_dom constructor and to load().
 * @param bool   $forceTagsClosed Forwarded to the simple_html_dom constructor.
 * @param string $target_charset  Forwarded to the simple_html_dom constructor.
 * @param bool   $stripRN         Forwarded to the simple_html_dom constructor and to load().
 * @param string $defaultBRText   Forwarded to the simple_html_dom constructor.
 * @param string $defaultSpanText Forwarded to the simple_html_dom constructor.
 *
 * @return simple_html_dom|false The loaded DOM, or false when $str is empty
 *                               (per empty()) or longer than MAX_FILE_SIZE bytes.
 */
function str_get_html($str, $lowercase = true, $forceTagsClosed = true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN = true, $defaultBRText = DEFAULT_BR_TEXT, $defaultSpanText = DEFAULT_SPAN_TEXT) {
    $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);

    if (empty($str) || strlen($str) > MAX_FILE_SIZE) {
        // Nothing to parse: release the DOM before reporting failure.
        $dom->clear();

        return false;
    }

    $dom->load($str, $lowercase, $stripRN);

    return $dom;
}
/**
 * Prints the tree below a node by delegating to the node's dump() method.
 *
 * @param object $node      Node exposing dump($show_attr, $deep).
 * @param bool   $show_attr Whether attributes are printed next to each tag.
 * @param int    $deep      Starting depth, used by dump() for the leading indentation.
 *
 * @return void
 */
function dump_html_tree($node, $show_attr = true, $deep = 0) {
    // Forward the caller's options; previously $node itself was passed as
    // $show_attr and $deep was dropped, so both arguments were ignored.
    $node->dump($show_attr, $deep);
}
/**
* simple html dom node
* PaperG - added ability for "find" routine to lowercase the value of the selector.
* PaperG - added $tag_start to track the start position of the tag in the total byte index
*
* @package PlaceLocalInclude
*/
class simple_html_dom_node {
// Node type, one of the HDOM_TYPE_* constants; a fresh node is a text node.
public $nodetype = HDOM_TYPE_TEXT;
// Tag name; 'text' for a fresh node.
public $tag = 'text';
// Attributes as name => value.
public $attr = array();
// Child nodes as returned by children(), first_child() and last_child().
public $children = array();
// Nodes rendered by innertext(), outertext(), text() and dump().
public $nodes = array();
// Parent node, or null when there is none.
public $parent = null;
// The "info" array - see HDOM_INFO_... for what each element contains.
public $_ = array();
// Start position of the tag in the total byte index.
public $tag_start = 0;
// Owning DOM object, assigned by the constructor and released by clear().
private $dom = null;
/**
 * Binds the node to its owning DOM and appends it to that DOM's node list.
 *
 * @param object $dom DOM object exposing a public $nodes array.
 */
function __construct($dom) {
    $this->dom = $dom;
    $dom->nodes[] = $this;
}
/**
 * Drops this node's references (see clear()) when the object is destroyed.
 */
function __destruct() {
    $this->clear();
}
/**
 * String conversion yields the node's outer text.
 *
 * @return string Result of outertext().
 */
function __toString() {
    return $this->outertext();
}
/**
 * Clean up memory due to php5 circular references memory leak.
 *
 * Nulls the references to the owning DOM, the node list, the parent and the
 * children so this node no longer takes part in reference cycles.
 *
 * @return void
 */
function clear() {
    $this->dom = null;
    $this->nodes = null;
    $this->parent = null;
    $this->children = null;
}
/**
 * Echoes this node's tag, then recursively every entry of $this->nodes.
 *
 * @param bool $show_attr When true, the node's attributes are echoed after the tag.
 * @param int  $deep      Depth of this node; that many spaces are echoed before the tag.
 *
 * @return void
 */
function dump($show_attr = true, $deep = 0) {
    $lead = str_repeat(' ', $deep);

    echo $lead . $this->tag;
    if ($show_attr && count($this->attr) > 0) {
        echo '(';
        foreach ($this->attr as $k => $v) {
            // Read the value through __get so it is charset-converted like any attribute access.
            echo " [ $k ]=> \" " . $this->$k . '", ';
        }
        echo ')';
    }
    echo " \n ";

    if ($this->nodes) {
        foreach ($this->nodes as $c) {
            $c->dump($show_attr, $deep + 1);
        }
    }
}
/**
 * Debugging function to dump a single dom node with a bunch of information about it.
 *
 * The summary contains the tag, the attributes, the "info" array, the text
 * (when set), the HDOM_INFO_INNER entry, the children and node counts, and
 * tag_start.
 *
 * @param bool $echo When true the summary is echoed and nothing is returned;
 *                   when false the summary is returned instead.
 *
 * @return string|null The summary when $echo is false, otherwise null.
 */
function dump_node($echo = true) {
    $string = $this->tag;

    if (count($this->attr) > 0) {
        $string .= '(';
        foreach ($this->attr as $k => $v) {
            $string .= " [ $k ]=> \" " . $this->$k . '", ';
        }
        $string .= ')';
    }

    if (count($this->_) > 0) {
        $string .= ' $_ (';
        foreach ($this->_ as $k => $v) {
            if (is_array($v)) {
                // Nested info entries (e.g. per-attribute data) are listed one level deep.
                $string .= " [ $k ]=>( ";
                foreach ($v as $k2 => $v2) {
                    $string .= " [ $k2 ]=> \" " . $v2 . '", ';
                }
                $string .= " ) ";
            } else {
                $string .= " [ $k ]=> \" " . $v . '", ';
            }
        }
        $string .= " ) ";
    }

    if (isset($this->text)) {
        $string .= " text: ( " . $this->text . " ) ";
    }

    $string .= " HDOM_INNER_INFO: ' ";
    // Inspect THIS node's info array; the previous code read an undefined
    // $node variable, so this branch always reported NULL.
    if (isset($this->_[HDOM_INFO_INNER])) {
        $string .= $this->_[HDOM_INFO_INNER] . " ' ";
    } else {
        $string .= ' NULL ';
    }

    $string .= " children: " . count($this->children);
    $string .= " nodes: " . count($this->nodes);
    $string .= " tag_start: " . $this->tag_start;
    $string .= " \n ";

    if ($echo) {
        echo $string;

        return;
    } else {
        return $string;
    }
}
/**
 * Returns the parent of this node, optionally re-parenting the node first.
 *
 * If a node is passed in, it becomes the new parent and this node is appended
 * to the new parent's $nodes and $children lists.
 *
 * @param simple_html_dom_node|null $parent New parent, or null to only read the current one.
 *
 * @return simple_html_dom_node|null The (possibly new) parent.
 */
function parent($parent = null) {
    // I am SURE that this doesn't work properly.
    // It fails to unset the current node from it's current parents nodes or children list first.
    if ($parent !== null) {
        $this->parent = $parent;
        $this->parent->nodes[] = $this;
        $this->parent->children[] = $this;
    }

    return $this->parent;
}
/**
 * Tells whether this node has any children.
 *
 * @return bool True when $this->children is non-empty.
 */
function has_child() {
    return !empty($this->children);
}
/**
 * Returns all children, or a single child by index.
 *
 * @param int $idx Index of the wanted child; -1 (the default) returns the whole list.
 *
 * @return array|simple_html_dom_node|null The children list, the child at $idx,
 *                                         or null when no child exists at $idx.
 */
function children($idx = -1) {
    if ($idx === -1) {
        return $this->children;
    }

    if (isset($this->children[$idx])) {
        return $this->children[$idx];
    }

    return null;
}
/**
 * Returns the first child of this node.
 *
 * @return simple_html_dom_node|null The first child, or null when there are no children.
 */
function first_child() {
    if (count($this->children) > 0) {
        return $this->children[0];
    }

    return null;
}
/**
 * Returns the last child of this node.
 *
 * @return simple_html_dom_node|null The last child, or null when there are no children.
 */
function last_child() {
    if (($count = count($this->children)) > 0) {
        return $this->children[$count - 1];
    }

    return null;
}
/**
 * Returns the next sibling of this node.
 *
 * Siblings are looked up in the parent's $children list.
 *
 * @return simple_html_dom_node|null The following sibling, or null when this node
 *                                   has no parent, is the last child, or is not
 *                                   found among the parent's children.
 */
function next_sibling() {
    if ($this->parent === null) {
        return null;
    }

    // Locate this node's position among the parent's children.
    $idx = 0;
    $count = count($this->parent->children);
    while ($idx < $count && $this !== $this->parent->children[$idx]) {
        ++$idx;
    }

    if (++$idx >= $count) {
        return null;
    }

    return $this->parent->children[$idx];
}
/**
 * Returns the previous sibling of this node.
 *
 * Siblings are looked up in the parent's $children list.
 *
 * @return simple_html_dom_node|null The preceding sibling, or null when this node
 *                                   has no parent, is the first child, or is not
 *                                   found among the parent's children.
 */
function prev_sibling() {
    if ($this->parent === null) {
        return null;
    }

    // Locate this node's position among the parent's children.
    $idx = 0;
    $count = count($this->parent->children);
    while ($idx < $count && $this !== $this->parent->children[$idx]) {
        ++$idx;
    }

    // Not found: without this guard the search index ends at $count and the
    // decrement below would wrongly hand back the parent's last child.
    if ($idx >= $count) {
        return null;
    }

    if (--$idx < 0) {
        return null;
    }

    return $this->parent->children[$idx];
}
/**
 * Locates a specific ancestor tag in the path to the root.
 *
 * The search starts with this node itself and then follows the parent links.
 *
 * @param string $tag Tag name to look for (compared with ==).
 *
 * @return simple_html_dom_node|null The first node on the path whose tag matches,
 *                                   or null when the parent chain ends without a match.
 */
function find_ancestor_tag($tag) {
    global $debugObject;
    if (is_object($debugObject)) {
        $debugObject->debugLogEntry(1);
    }

    // Start by including ourselves in the comparison.
    $returnDom = $this;

    while (!is_null($returnDom)) {
        if (is_object($debugObject)) {
            $debugObject->debugLog(2, " Current tag is: " . $returnDom->tag);
        }

        if ($returnDom->tag == $tag) {
            break;
        }
        $returnDom = $returnDom->parent;
    }

    return $returnDom;
}
/**
 * Returns the node's inner html.
 *
 * Resolution order: an explicit HDOM_INFO_INNER entry, then the
 * HDOM_INFO_TEXT entry passed through the DOM's restore_noise(), and
 * otherwise the concatenated outertext() of every entry in $this->nodes.
 *
 * @return string
 */
function innertext() {
    if (isset($this->_[HDOM_INFO_INNER])) {
        return $this->_[HDOM_INFO_INNER];
    }
    if (isset($this->_[HDOM_INFO_TEXT])) {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }

    $ret = '';
    foreach ($this->nodes as $n) {
        $ret .= $n->outertext();
    }

    return $ret;
}
/**
 * Returns the node's outer text (with tag).
 *
 * For the root node this is just innertext(). Otherwise the DOM callback (if
 * any) is invoked first, then an explicit HDOM_INFO_OUTER or HDOM_INFO_TEXT
 * entry wins; failing that the begin tag, the inner content and the end tag
 * are rendered in turn.
 *
 * @return string
 */
function outertext() {
    global $debugObject;
    if (is_object($debugObject)) {
        $text = '';
        if ($this->tag == 'text') {
            if (!empty($this->text)) {
                $text = " with text: " . $this->text;
            }
        }
        $debugObject->debugLog(1, 'Innertext of tag: ' . $this->tag . $text);
    }

    if ($this->tag === 'root') {
        return $this->innertext();
    }

    // trigger callback
    if ($this->dom && $this->dom->callback !== null) {
        call_user_func_array($this->dom->callback, array($this));
    }

    if (isset($this->_[HDOM_INFO_OUTER])) {
        return $this->_[HDOM_INFO_OUTER];
    }
    if (isset($this->_[HDOM_INFO_TEXT])) {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }

    // render begin tag
    if ($this->dom && $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]) {
        $ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();
    } else {
        $ret = " ";
    }

    // render inner text
    if (isset($this->_[HDOM_INFO_INNER])) {
        // If it's a br tag... don't return the HDOM_INNER_INFO that we may or may not have added.
        // The tag is compared with the bare name: the space-padded literal used
        // before could never equal a tag name, so br inner text was always emitted.
        if ($this->tag != 'br') {
            $ret .= $this->_[HDOM_INFO_INNER];
        }
    } else {
        if ($this->nodes) {
            foreach ($this->nodes as $n) {
                $ret .= $this->convert_text($n->outertext());
            }
        }
    }

    // render end tag
    if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END] != 0) {
        $ret .= '</' . $this->tag . '>';
    }

    return $ret;
}
/**
 * Returns the node's plain text.
 *
 * An explicit HDOM_INFO_INNER entry wins. Text nodes return their
 * HDOM_INFO_TEXT entry passed through the DOM's restore_noise(); comment and
 * unknown nodes, and script/style elements, yield an empty string. Any other
 * node concatenates the text() of every entry in $this->nodes.
 *
 * @return string
 */
function text() {
    if (isset($this->_[HDOM_INFO_INNER])) {
        return $this->_[HDOM_INFO_INNER];
    }

    switch ($this->nodetype) {
        case HDOM_TYPE_TEXT:
            return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
        case HDOM_TYPE_COMMENT:
            return '';
        case HDOM_TYPE_UNKNOWN:
            return '';
    }

    if (strcasecmp($this->tag, 'script') === 0) {
        return '';
    }
    if (strcasecmp($this->tag, 'style') === 0) {
        return '';
    }

    $ret = '';
    // In rare cases, (always node type 1 or HDOM_TYPE_ELEMENT - observed for some span tags, and some p tags) $this->nodes is set to NULL.
    // NOTE: This indicates that there is a problem where it's set to NULL without a clear happening.
    // WHY is this happening?
    if (!is_null($this->nodes)) {
        foreach ($this->nodes as $n) {
            $ret .= $this->convert_text($n->text());
        }
        // If this node is a span... add a space at the end of it so multiple spans don't run into each other. This is plaintext after all.
        // Compared with the bare tag name: the space-padded literal used before never matched.
        if ($this->tag == 'span') {
            $ret .= $this->dom->default_span_text;
        }
    }

    return $ret;
}
/**
 * Returns innertext() with CDATA section markers removed.
 *
 * The opening marker is removed case-insensitively, the closing "]]>" exactly.
 *
 * @return string
 */
function xmltext() {
    $ret = $this->innertext();
    $ret = str_ireplace('<![CDATA[', '', $ret);
    $ret = str_replace(']]>', '', $ret);

    return $ret;
}
/**
 * Builds the node's opening tag text, including its attributes.
 *
 * Nodes carrying an HDOM_INFO_TEXT entry (text, comment, unknown) return that
 * entry passed through the DOM's restore_noise() instead.
 *
 * @return string
 */
function makeup() {
    // text, comment, unknown
    if (isset($this->_[HDOM_INFO_TEXT])) {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }

    $ret = '<' . $this->tag;
    // $i tracks the attribute's position so the matching HDOM_INFO_SPACE and
    // HDOM_INFO_QUOTE entries can be looked up; it advances even for skipped attributes.
    $i = -1;

    foreach ($this->attr as $key => $val) {
        ++$i;

        // skip removed attribute
        if ($val === null || $val === false) {
            continue;
        }

        $ret .= $this->_[HDOM_INFO_SPACE][$i][0];

        //no value attr: nowrap, checked selected...
        if ($val === true) {
            $ret .= $key;
        } else {
            switch ($this->_[HDOM_INFO_QUOTE][$i]) {
                case HDOM_QUOTE_DOUBLE:
                    $quote = '"';
                    break;
                case HDOM_QUOTE_SINGLE:
                    $quote = '\'';
                    break;
                default:
                    $quote = '';
            }
            $ret .= $key . $this->_[HDOM_INFO_SPACE][$i][1] . '=' . $this->_[HDOM_INFO_SPACE][$i][2] . $quote . $val . $quote;
        }
    }

    $ret = $this->dom->restore_noise($ret);

    return $ret . $this->_[HDOM_INFO_ENDSPACE] . '>';
}
/**
 * Finds elements by css selector.
 *
 * PaperG - added ability for find to lowercase the value of the selector.
 *
 * @param string   $selector  Selector string; comma-separated alternatives are supported.
 * @param int|null $idx       Null returns every match; otherwise the match at that
 *                            position, where a negative value counts from the end.
 * @param bool     $lowercase Passed on to seek() for case-insensitive value tests.
 *
 * @return array|simple_html_dom_node|null All matches when $idx is null, otherwise
 *                                         the selected match or null if it does not exist.
 */
function find($selector, $idx = null, $lowercase = false) {
    $selectors = $this->parse_selector($selector);
    if (($count = count($selectors)) === 0) {
        return array();
    }

    $found_keys = array();

    // find each selector
    for ($c = 0; $c < $count; ++$c) {
        // The change on the below line was documented on the sourceforge code tracker id 2788009
        // used to be: if (($levle=count($selectors[0]))===0) return array();
        if (($level = count($selectors[$c])) === 0) {
            return array();
        }
        if (!isset($this->_[HDOM_INFO_BEGIN])) {
            return array();
        }

        $head = array($this->_[HDOM_INFO_BEGIN] => 1);

        // handle descendant selectors, no recursive!
        for ($l = 0; $l < $level; ++$l) {
            $ret = array();
            foreach ($head as $k => $v) {
                $n = ($k === -1) ? $this->dom->root : $this->dom->nodes[$k];
                //PaperG - Pass this optional parameter on to the seek function.
                $n->seek($selectors[$c][$l], $ret, $lowercase);
            }
            $head = $ret;
        }

        // Collect node indexes as keys so a node matched by several alternatives appears once.
        foreach ($head as $k => $v) {
            if (!isset($found_keys[$k])) {
                $found_keys[$k] = 1;
            }
        }
    }

    // sort keys
    ksort($found_keys);

    $found = array();
    foreach ($found_keys as $k => $v) {
        $found[] = $this->dom->nodes[$k];
    }

    // return nth-element or array
    if (is_null($idx)) {
        return $found;
    } else if ($idx < 0) {
        $idx = count($found) + $idx;
    }

    return (isset($found[$idx])) ? $found[$idx] : null;
}
/**
 * Seeks the nodes below this one that satisfy a single parsed selector.
 *
 * PaperG - added parameter to allow for case insensitive testing of the value of a selector.
 *
 * @param array $selector  One parsed selector: array($tag, $key, $val, $exp, $no_key).
 * @param array $ret       Passed by reference; indexes of matching nodes are added as keys.
 *                         This is effectively the function's return value.
 * @param bool  $lowercase When true, values are lowercased before being compared.
 *
 * @return void
 */
protected function seek($selector, &$ret, $lowercase = false) {
    global $debugObject;
    if (is_object($debugObject)) {
        $debugObject->debugLogEntry(1);
    }

    list($tag, $key, $val, $exp, $no_key) = $selector;

    // xpath index
    if ($tag && $key && is_numeric($key)) {
        $count = 0;
        foreach ($this->children as $c) {
            if ($tag === '*' || $tag === $c->tag) {
                if (++$count == $key) {
                    $ret[$c->_[HDOM_INFO_BEGIN]] = 1;

                    return;
                }
            }
        }

        return;
    }

    $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
    if ($end == 0) {
        // No end index of our own: walk up to the nearest ancestor that has one.
        $parent = $this->parent;
        while ($parent !== null && !isset($parent->_[HDOM_INFO_END])) {
            $end -= 1;
            $parent = $parent->parent;
        }
        // The walk can run off the top of the tree; only read the ancestor when one was found.
        if ($parent !== null) {
            $end += $parent->_[HDOM_INFO_END];
        }
    }

    for ($i = $this->_[HDOM_INFO_BEGIN] + 1; $i < $end; ++$i) {
        $node = $this->dom->nodes[$i];
        $pass = true;

        if ($tag === '*' && !$key) {
            if (in_array($node, $this->children, true)) {
                $ret[$i] = 1;
            }
            continue;
        }

        // compare tag
        if ($tag && $tag != $node->tag && $tag !== '*') {
            $pass = false;
        }

        // compare key
        if ($pass && $key) {
            if ($no_key) {
                if (isset($node->attr[$key])) {
                    $pass = false;
                }
            } else {
                // "plaintext" is a pseudo-key, not a real attribute. It is compared with the
                // bare name: the space-padded literal used before could never match a key.
                if (($key != 'plaintext') && !isset($node->attr[$key])) {
                    $pass = false;
                }
            }
        }

        // compare value
        if ($pass && $key && $val && $val !== '*') {
            // If they have told us that this is a "plaintext" search then we want the plaintext of the node - right?
            if ($key == 'plaintext') {
                // $node->plaintext actually returns $node->text();
                $nodeKeyValue = $node->text();
            } else {
                // this is a normal search, we want the value of that attribute of the tag.
                $nodeKeyValue = $node->attr[$key];
            }
            if (is_object($debugObject)) {
                $debugObject->debugLog(2, " testing node: " . $node->tag . " for attribute: " . $key . $exp . $val . " where nodes value is: " . $nodeKeyValue);
            }

            //PaperG - If lowercase is set, do a case insensitive test of the value of the selector.
            if ($lowercase) {
                $check = $this->match($exp, strtolower($val), strtolower($nodeKeyValue));
            } else {
                $check = $this->match($exp, $val, $nodeKeyValue);
            }
            if (is_object($debugObject)) {
                $debugObject->debugLog(2, " after match: " . ($check ? " true " : " false "));
            }

            // handle multiple class
            if (!$check && strcasecmp($key, 'class') === 0) {
                foreach (explode(' ', $node->attr[$key]) as $k) {
                    // Without this, there were cases where leading, trailing, or double spaces lead to our comparing blanks - bad form.
                    if (!empty($k)) {
                        if ($lowercase) {
                            $check = $this->match($exp, strtolower($val), strtolower($k));
                        } else {
                            $check = $this->match($exp, $val, $k);
                        }
                        if ($check) {
                            break;
                        }
                    }
                }
            }

            if (!$check) {
                $pass = false;
            }
        }

        if ($pass) {
            $ret[$i] = 1;
        }
        unset($node);
    }

    // It's passed by reference so this is actually what this function returns.
    if (is_object($debugObject)) {
        $debugObject->debugLog(1, " EXIT - ret: ", $ret);
    }
}
/**
 * Tests a value against a selector pattern using the given comparison operator.
 *
 * Supported operators: '=' (identical), '!=' (not identical), '^=' (starts
 * with), '$=' (ends with) and '*=' (regex match). For '*=' a pattern that
 * starts with '/' is used as a complete regular expression; any other
 * pattern is wrapped as a case-insensitive regular expression without quoting.
 *
 * @param string $exp     Comparison operator.
 * @param string $pattern Value taken from the selector.
 * @param string $value   Value taken from the node.
 *
 * @return bool|int Boolean for '=' and '!=', the preg_match() result for the
 *                  regex-based operators, false for an unknown operator.
 */
protected function match($exp, $pattern, $value) {
    global $debugObject;
    if (is_object($debugObject)) {
        $debugObject->debugLogEntry(1);
    }

    // The regex delimiters and anchors are written without padding: spaces
    // inside these literals are significant to PCRE and made the anchored
    // tests require literal spaces around the pattern.
    switch ($exp) {
        case '=':
            return ($value === $pattern);
        case '!=':
            return ($value !== $pattern);
        case '^=':
            return preg_match('/^' . preg_quote($pattern, '/') . '/', $value);
        case '$=':
            return preg_match('/' . preg_quote($pattern, '/') . '$/', $value);
        case '*=':
            if ($pattern[0] == '/') {
                return preg_match($pattern, $value);
            }

            return preg_match('/' . $pattern . '/i', $value);
    }

    return false;
}
/**
 * Parses a selector string into a list of selector chains.
 *
 * Each comma-separated alternative becomes one chain; each chain is a list of
 * array($tag, $key, $val, $exp, $no_key) entries, one per descendant level.
 *
 * @param string $selector_string Selector string as given to find().
 *
 * @return array List of selector chains (empty when nothing could be parsed).
 */
protected function parse_selector($selector_string) {
    global $debugObject;
    if (is_object($debugObject)) {
        $debugObject->debugLogEntry(1);
    }

    // pattern of CSS selectors, modified from mootools
    // Paperg: Add the colon to the attrbute, so that it properly finds <tag attr:ibute="something" > like google does.
    // Note: if you try to look at this attribute, yo MUST use getAttribute since $dom->x:y will fail the php syntax check.
    // Notice the \[ starting the attbute? and the @? following? This implies that an attribute can begin with an @ sign that is not captured.
    // This implies that an html attribute specifier may start with an @ sign that is NOT captured by the expression.
    // farther study is required to determine of this should be documented or removed.
    //
    // Hyphens inside the character classes are escaped: PCRE2 (PHP 7.3+) rejects
    // the unescaped "\w-:" form with "invalid range in character class".
    $pattern = '/([\w\-:\*]*)(?:\#([\w\-]+)|\.([\w\-]+))?(?:\[@?(!?[\w\-:]+)(?:([!*^$]?=)["\']?(.*?)["\']?)?\])?([\/, ]+)/is';

    // A trailing space guarantees the final selector is terminated by the separator group.
    preg_match_all($pattern, trim($selector_string) . ' ', $matches, PREG_SET_ORDER);
    if (is_object($debugObject)) {
        $debugObject->debugLog(2, " Matches Array: ", $matches);
    }

    $selectors = array();
    $result = array();
    //print_r($matches);

    foreach ($matches as $m) {
        $m[0] = trim($m[0]);
        if ($m[0] === '' || $m[0] === '/' || $m[0] === '//') {
            continue;
        }

        // for browser generated xpath
        if ($m[1] === 'tbody') {
            continue;
        }

        list($tag, $key, $val, $exp, $no_key) = array($m[1], null, null, '=', false);
        if (!empty($m[2])) {
            $key = 'id';
            $val = $m[2];
        }
        if (!empty($m[3])) {
            $key = 'class';
            $val = $m[3];
        }
        if (!empty($m[4])) {
            $key = $m[4];
        }
        if (!empty($m[5])) {
            $exp = $m[5];
        }
        if (!empty($m[6])) {
            $val = $m[6];
        }

        // convert to lowercase
        if ($this->dom->lowercase) {
            $tag = strtolower($tag);
            // Passing null to strtolower() is deprecated since PHP 8.1; a missing key stays null.
            if ($key !== null) {
                $key = strtolower($key);
            }
        }

        //elements that do NOT have the specified attribute
        if (isset($key[0]) && $key[0] === '!') {
            $key = substr($key, 1);
            $no_key = true;
        }

        $result[] = array($tag, $key, $val, $exp, $no_key);

        // A comma closes the current chain and starts a new alternative.
        if (trim($m[7]) === ',') {
            $selectors[] = $result;
            $result = array();
        }
    }

    if (count($result) > 0) {
        $selectors[] = $result;
    }

    return $selectors;
}
/**
 * Magic getter for attributes and the virtual text properties.
 *
 * A set attribute is returned charset-converted. Otherwise 'outertext',
 * 'innertext', 'plaintext' and 'xmltext' map to the corresponding methods,
 * and any other name yields whether the attribute key exists at all.
 *
 * @param string $name Attribute or virtual property name.
 *
 * @return mixed
 */
function __get($name) {
    if (isset($this->attr[$name])) {
        return $this->convert_text($this->attr[$name]);
    }

    switch ($name) {
        case 'outertext':
            return $this->outertext();
        case 'innertext':
            return $this->innertext();
        case 'plaintext':
            return $this->text();
        case 'xmltext':
            return $this->xmltext();
        default:
            return array_key_exists($name, $this->attr);
    }
}
/**
 * Magic setter for attributes and the virtual text properties.
 *
 * 'outertext' stores an HDOM_INFO_OUTER override. 'innertext' overwrites
 * HDOM_INFO_TEXT when the node has one and otherwise stores an
 * HDOM_INFO_INNER override. Any other name sets an attribute.
 *
 * @param string $name  Attribute or virtual property name.
 * @param mixed  $value New value.
 *
 * @return mixed The stored value for the virtual properties, nothing for attributes.
 */
function __set($name, $value) {
    switch ($name) {
        case 'outertext':
            return $this->_[HDOM_INFO_OUTER] = $value;
        case 'innertext':
            if (isset($this->_[HDOM_INFO_TEXT])) {
                return $this->_[HDOM_INFO_TEXT] = $value;
            }

            return $this->_[HDOM_INFO_INNER] = $value;
    }

    // An attribute that is not set yet also needs spacing and quoting
    // entries, which makeup() reads by attribute position.
    if (!isset($this->attr[$name])) {
        $this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
        $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
    }
    $this->attr[$name] = $value;
}
/**
 * Magic isset for attributes and the virtual text properties.
 *
 * 'outertext', 'innertext' and 'plaintext' always count as set; any other
 * name is set when the attribute key exists.
 *
 * @param string $name Attribute or virtual property name.
 *
 * @return bool
 */
function __isset($name) {
    switch ($name) {
        case 'outertext':
            return true;
        case 'innertext':
            return true;
        case 'plaintext':
            return true;
    }

    //no value attr: nowrap, checked selected...
    return (array_key_exists($name, $this->attr)) ? true : isset($this->attr[$name]);
}
/**
 * Magic unset: removes an attribute when it is currently set.
 *
 * @param string $name Attribute name.
 *
 * @return void
 */
function __unset($name) {
    if (isset($this->attr[$name])) {
        unset($this->attr[$name]);
    }
}
/**
 * PaperG - Converts text from the DOM's source character set to its target
 * character set if the two sets are not the same.
 *
 * When the target is UTF-8, a byte order mark at the start or end of the
 * result is stripped as well.
 *
 * @param string $text Text to convert.
 *
 * @return string|false The converted text; the result of iconv() is returned
 *                      as-is, so a failed conversion yields false.
 */
function convert_text($text) {
    global $debugObject;
    if (is_object($debugObject)) {
        $debugObject->debugLogEntry(1);
    }

    $converted_text = $text;
    $sourceCharset = " ";
    $targetCharset = " ";

    if ($this->dom) {
        $sourceCharset = strtoupper($this->dom->_charset);
        $targetCharset = strtoupper($this->dom->_target_charset);
    }
    if (is_object($debugObject)) {
        $debugObject->debugLog(3, " source charset: " . $sourceCharset . " target charaset: " . $targetCharset);
    }

    if (!empty($sourceCharset) && !empty($targetCharset) && (strcasecmp($sourceCharset, $targetCharset) != 0)) {
        // Check if the reported encoding could have been incorrect and the text is actually already UTF-8
        if ((strcasecmp($targetCharset, 'UTF-8') == 0) && ($this->is_utf8($text))) {
            $converted_text = $text;
        } else {
            $converted_text = iconv($sourceCharset, $targetCharset, $text);
        }
    }

    // Lets make sure that we don't have that silly BOM issue with any of the utf-8 text we output.
    // The BOM is exactly the three bytes EF BB BF; the space-padded literal
    // used before was seven bytes long and could never equal a 3-byte substring.
    if ($targetCharset == 'UTF-8') {
        if (substr($converted_text, 0, 3) == "\xef\xbb\xbf") {
            $converted_text = substr($converted_text, 3);
        }
        if (substr($converted_text, -3) == "\xef\xbb\xbf") {
            $converted_text = substr($converted_text, 0, -3);
        }
    }

    return $converted_text;
}
/**
 * Returns true if $str is structurally valid UTF-8 and false otherwise.
 *
 * This is a structural check only: each lead byte must be followed by the
 * number of continuation bytes (0x80-0xBF) it announces. Lead bytes for the
 * historical 5 and 6 byte forms are still accepted, and overlong encodings
 * are not rejected.
 *
 * @param mixed $str String to be tested
 *
 * @return boolean
 */
static function is_utf8($str) {
    $len = strlen($str);
    for ($i = 0; $i < $len; $i++) {
        $c = ord($str[$i]);

        // 0x00-0x7F is plain ASCII. 0x80 itself is a continuation byte and
        // must NOT be treated as ASCII, hence ">= 128" rather than "> 128".
        if ($c < 128) {
            continue;
        }

        // Derive the sequence length from the lead byte.
        if ($c >= 254) {
            return false;
        } elseif ($c >= 252) {
            $bits = 6;
        } elseif ($c >= 248) {
            $bits = 5;
        } elseif ($c >= 240) {
            $bits = 4;
        } elseif ($c >= 224) {
            $bits = 3;
        } elseif ($c >= 192) {
            $bits = 2;
        } else {
            // 0x80-0xBF: continuation byte without a lead byte.
            return false;
        }

        // The announced sequence must fit inside the string.
        if (($i + $bits) > $len) {
            return false;
        }

        // Every following byte of the sequence must be a continuation byte.
        while ($bits > 1) {
            $i++;
            $b = ord($str[$i]);
            if ($b < 128 || $b > 191) {
                return false;
            }
            $bits--;
        }
    }

    return true;
}
/*
function is_utf8 ( $string )
{
//this is buggy
return ( utf8_encode ( utf8_decode ( $string )) == $string );
}
*/
/**
 * Function to try a few tricks to determine the displayed size of an img on the page.
 * NOTE: This will ONLY work on an IMG tag. Returns FALSE on all other tag types.
 *
 * For each dimension the tag's own width/height attribute wins; otherwise a
 * whole-pixel value ("123px") from the inline style attribute is used.
 *
 * @return array|false an array containing the 'height' and 'width' of the image
 *                     on the page (-1 for a dimension we can't figure out), or
 *                     false when this node is not an img tag.
 * @version April 19 2012
 * @author John Schlick
 */
function get_display_size() {
    if ($this->tag !== 'img') {
        return false;
    }

    // -1 marks "not determined"; key order matches the historical return value.
    $size = array('height' => -1, 'width' => -1);

    // See if there is a height or width attribute in the tag itself.
    foreach (array('width', 'height') as $dimension) {
        if (isset($this->attr[$dimension])) {
            $size[$dimension] = $this->attr[$dimension];
        }
    }

    // Now look for an inline style.
    if (isset($this->attr['style'])) {
        // Thanks to user gnarf from stackoverflow for this regular expression.
        $declarations = array();
        preg_match_all('/([\w-]+)\s*:\s*([^;]+)\s*;?/', $this->attr['style'], $matches, PREG_SET_ORDER);
        foreach ($matches as $match) {
            // Property names are matched case-insensitively; [^;]+ also
            // captures blanks before the ';', so the value is trimmed.
            $declarations[strtolower($match[1])] = trim($match[2]);
        }

        foreach (array('width', 'height') as $dimension) {
            // Only consult the style when the attribute did not provide a value.
            if (!isset($declarations[$dimension]) || $size[$dimension] != -1) {
                continue;
            }

            // check that the last two characters are px (pixels)
            $value = $declarations[$dimension];
            if (strtolower(substr($value, -2)) !== 'px') {
                continue;
            }

            // Now make sure that it's an integer and not something stupid.
            // filter_var() returns the int on success, so compare against
            // false explicitly - otherwise "0px" would be rejected.
            $proposed = trim(substr($value, 0, -2));
            if (filter_var($proposed, FILTER_VALIDATE_INT) !== false) {
                $size[$dimension] = $proposed;
            }
        }
    }

    // Future enhancement:
    // Look in the tag to see if there is a class or id specified that has a height or width attribute to it.
    // Far future enhancement
    // Look at all the parent tags of this image to see if they specify a class or id that has an img selector that specifies a height or width
    // Note that in this case, the class or id will have the img subselector for it to apply to the image.
    // ridiculously far future development
    // If the class or id is specified in a SEPARATE css file thats not on the page, go get it and do what we were just doing for the ones on the page.
    return $size;
}
// camel naming conventions
2021-03-07 13:31:53 +08:00
/**
 * Returns this node's attribute list ($this->attr) as-is.
 */
function getAllAttributes () {
return $this -> attr ;
}
/**
 * camelCase alias: returns whatever the magic getter __get() yields for $name.
 *
 * @param string $name attribute name
 */
function getAttribute ( $name ) {
return $this -> __get ( $name );
}
/**
 * camelCase alias: forwards to the magic setter __set(). Returns nothing.
 *
 * @param string $name  attribute name
 * @param mixed  $value value to store
 */
function setAttribute ( $name , $value ) {
$this -> __set ( $name , $value );
}
/**
 * camelCase alias: returns the result of __isset() for $name.
 *
 * @param string $name attribute name
 */
function hasAttribute ( $name ) {
return $this -> __isset ( $name );
}
/**
 * "Removes" an attribute by assigning null to it through __set().
 *
 * The key is not unset here; it is stored with a null value.
 *
 * @param string $name attribute name
 */
function removeAttribute ( $name ) {
$this -> __set ( $name , null );
}
/**
 * Builds an id selector from $id and returns find()'s result for index 0.
 *
 * @param string $id element id
 */
function getElementById ( $id ) {
return $this -> find ( " # $id " , 0 );
}
/**
 * Builds an id selector from $id and returns find()'s result.
 *
 * @param string   $id  element id
 * @param int|null $idx index forwarded to find(); null by default
 */
function getElementsById ( $id , $idx = null ) {
return $this -> find ( " # $id " , $idx );
}
/**
 * Uses $name directly as the selector and returns find()'s result for index 0.
 *
 * @param string $name tag name
 */
function getElementByTagName ( $name ) {
return $this -> find ( $name , 0 );
}
/**
 * Uses $name directly as the selector and returns find()'s result.
 *
 * @param string   $name tag name
 * @param int|null $idx  index forwarded to find(); null by default
 */
function getElementsByTagName ( $name , $idx = null ) {
return $this -> find ( $name , $idx );
}
/**
 * camelCase alias of parent(), called without arguments.
 */
function parentNode () {
return $this -> parent ();
}
/**
 * camelCase alias of children().
 *
 * @param int $idx index forwarded to children(); defaults to -1
 */
function childNodes ( $idx = - 1 ) {
return $this -> children ( $idx );
}
/**
 * camelCase alias of first_child().
 */
function firstChild () {
return $this -> first_child ();
}
/**
 * camelCase alias of last_child().
 */
function lastChild () {
return $this -> last_child ();
}
/**
 * camelCase alias of next_sibling().
 */
function nextSibling () {
return $this -> next_sibling ();
}
/**
 * camelCase alias of prev_sibling().
 */
function previousSibling () {
return $this -> prev_sibling ();
}
/**
 * camelCase alias of has_child().
 */
function hasChildNodes () {
return $this -> has_child ();
}
/**
 * Returns this node's tag property.
 */
function nodeName () {
return $this -> tag ;
}
/**
 * Passes this node to $node->parent() and returns $node.
 *
 * NOTE(review): presumably parent() with an argument re-parents $node under
 * this node - confirm against parent()'s definition.
 *
 * @param object $node node to attach
 *
 * @return object the same $node that was passed in
 */
function appendChild ( $node ) {
$node -> parent ( $this );
return $node ;
}
2021-02-26 15:31:59 +08:00
}
/**
* simple html dom parser
* Paperg - in the find routine : allow us to specify that we want case insensitive testing of the value of the selector .
* Paperg - change $size from protected to public so we can easily access it
* Paperg - added ForceTagsClosed in the constructor which tells us whether we trust the html or not . Default is to NOT trust it .
*
* @ package PlaceLocalInclude
*/
2021-03-07 13:31:53 +08:00
class simple_html_dom {
2021-02-26 15:31:59 +08:00
// Root node of the parsed tree; created in prepare() with tag 'root'.
public $root = null ;
// Node list; reset to an empty array in prepare(), each entry is clear()ed in clear().
public $nodes = array ();
// Callback set by set_callback() and reset to null by remove_callback().
public $callback = null ;
// When true, read_tag() lower-cases tag and attribute names.
public $lowercase = false ;
// Used to keep track of how large the text was when we started.
public $original_size ;
// Length of the document text; recomputed in prepare() after CR/LF replacement.
public $size ;
// Current offset into $doc.
protected $pos ;
// Document text being parsed.
protected $doc ;
// Character at $pos, or null once $pos has run past the end of $doc.
protected $char ;
// Running counter stored into nodes' HDOM_INFO_BEGIN / HDOM_INFO_END markers.
protected $cursor ;
// Node under which newly parsed nodes are linked (see link_nodes()).
protected $parent ;
// Reset to an empty array in prepare(); presumably filled by remove_noise() - confirm.
protected $noise = array ();
// Character sets handed to skip() / copy_skip() / copy_until() while tokenizing.
protected $token_blank = " \t \r \n " ;
protected $token_equal = ' =/>' ;
protected $token_slash = " /> \r \n \t " ;
protected $token_attr = ' >' ;
// Note that this is referenced by a child node, and so it needs to be public for that node to see this information.
public $_charset = '' ;
public $_target_charset = '' ;
// Text assigned to a br node's HDOM_INFO_INNER in read_tag().
protected $default_br_text = " " ;
public $default_span_text = " " ;
// use isset instead of in_array, performance boost about 30%...
// Tags that never become the current parent in read_tag().
2021-03-07 13:31:53 +08:00
protected $self_closing_tags = array ( 'img' => 1 ,
'br' => 1 ,
'input' => 1 ,
'meta' => 1 ,
'link' => 1 ,
'hr' => 1 ,
'base' => 1 ,
'embed' => 1 ,
'spacer' => 1
);
// Tags consulted by read_tag() when an end tag does not match the current parent.
protected $block_tags = array ( 'root' => 1 , 'body' => 1 , 'form' => 1 , 'div' => 1 , 'span' => 1 , 'table' => 1 );
2021-02-26 15:31:59 +08:00
// Known sourceforge issue #2977341
// B tags that are not closed cause us to return everything to the end of the document.
// Map: tag being opened => set of currently open parent tags that read_tag() closes implicitly.
protected $optional_closing_tags = array (
2021-03-07 13:31:53 +08:00
'tr' => array ( 'tr' => 1 , 'td' => 1 , 'th' => 1 ),
'th' => array ( 'th' => 1 ),
'td' => array ( 'td' => 1 ),
'li' => array ( 'li' => 1 ),
'dt' => array ( 'dt' => 1 , 'dd' => 1 ),
'dd' => array ( 'dd' => 1 , 'dt' => 1 ),
'dl' => array ( 'dd' => 1 , 'dt' => 1 ),
'p' => array ( 'p' => 1 ),
'nobr' => array ( 'nobr' => 1 ),
'b' => array ( 'b' => 1 ),
'option' => array ( 'option' => 1 ),
2021-02-26 15:31:59 +08:00
);
2021-03-07 13:31:53 +08:00
/**
 * Creates the parser and optionally loads a document right away.
 *
 * When $str is non-empty it is loaded immediately: through load_file() if it
 * starts with "http://" or names an existing file, otherwise through load()
 * as literal markup.
 *
 * @param string|null $str             markup, file path or http URL; nothing is loaded when empty
 * @param bool        $lowercase       forwarded to load()
 * @param bool        $forceTagsClosed when false, the optional-closing-tag table is emptied so
 *                                     open tags are not closed implicitly
 * @param string      $target_charset  stored in $_target_charset
 * @param bool        $stripRN         forwarded to load()
 * @param string      $defaultBRText   forwarded to load()
 * @param string      $defaultSpanText forwarded to load()
 */
function __construct($str = null, $lowercase = true, $forceTagsClosed = true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN = true, $defaultBRText = DEFAULT_BR_TEXT, $defaultSpanText = DEFAULT_SPAN_TEXT) {
    // Parser settings have to be in place BEFORE the document is parsed,
    // otherwise they cannot influence a load triggered by this constructor.
    // Forcing tags to be closed implies that we don't trust the html, but it can lead to parsing errors if we SHOULD trust the html.
    if (!$forceTagsClosed) {
        // The table read by read_tag() is $optional_closing_tags.
        $this->optional_closing_tags = array();
    }
    $this->_target_charset = $target_charset;

    if ($str) {
        if (preg_match('/^http:\/\//i', $str) || is_file($str)) {
            $this->load_file($str);
        } else {
            $this->load($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText);
        }
    }
}
2021-03-07 13:31:53 +08:00
/**
 * Destructor: runs clear() so the references held by this object are released.
 */
function __destruct () {
2021-02-26 15:31:59 +08:00
$this -> clear ();
}
// load html from string
2021-03-07 13:31:53 +08:00
/**
 * Loads markup from a string and parses it into a node tree.
 *
 * Steps, in order: prepare() resets the parser state; a fixed sequence of
 * remove_noise() patterns is applied (comments, CDATA, script, style, code,
 * server-side and smarty blocks); parse() is called until it returns false;
 * finally parse_charset() runs. The order of the remove_noise() calls is
 * significant (see the sourceforge note below).
 *
 * @param string $str             markup to parse
 * @param bool   $lowercase       forwarded to prepare()
 * @param bool   $stripRN         forwarded to prepare()
 * @param string $defaultBRText   forwarded to prepare()
 * @param string $defaultSpanText forwarded to prepare()
 *
 * @return simple_html_dom $this, so calls can be chained
 */
function load ( $str , $lowercase = true , $stripRN = true , $defaultBRText = DEFAULT_BR_TEXT , $defaultSpanText = DEFAULT_SPAN_TEXT ) {
2021-02-26 15:31:59 +08:00
// NOTE(review): $debugObject is declared but not used in this method.
global $debugObject ;
// prepare
$this -> prepare ( $str , $lowercase , $stripRN , $defaultBRText , $defaultSpanText );
// strip out comments
$this -> remove_noise ( " '<!--(.*?)-->'is " );
// strip out cdata
$this -> remove_noise ( " '<! \ [CDATA \ [(.*?) \ ] \ ]>'is " , true );
// Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037
// Script tags removal now preceeds style tag removal.
// strip out <script> tags
$this -> remove_noise ( " '< \ s*script[^>]*[^/]>(.*?)< \ s*/ \ s*script \ s*>'is " );
$this -> remove_noise ( " '< \ s*script \ s*>(.*?)< \ s*/ \ s*script \ s*>'is " );
// strip out <style> tags
$this -> remove_noise ( " '< \ s*style[^>]*[^/]>(.*?)< \ s*/ \ s*style \ s*>'is " );
$this -> remove_noise ( " '< \ s*style \ s*>(.*?)< \ s*/ \ s*style \ s*>'is " );
// strip out preformatted tags
$this -> remove_noise ( " '< \ s*(?:code)[^>]*>(.*?)< \ s*/ \ s*(?:code) \ s*>'is " );
// strip out server side scripts
$this -> remove_noise ( " '(< \ ?)(.*?)( \ ?>)'s " , true );
// strip smarty scripts
$this -> remove_noise ( " '( \ { \ w)(.*?)( \ })'s " , true );
// parsing: each parse() call consumes one piece of input; loop until it reports false
2021-03-07 13:31:53 +08:00
while ( $this -> parse ()) {
;
}
2021-02-26 15:31:59 +08:00
// end
$this -> root -> _ [ HDOM_INFO_END ] = $this -> cursor ;
$this -> parse_charset ();
// make load function chainable
return $this ;
}
// load html from file
2021-03-07 13:31:53 +08:00
/**
 * Loads markup from a file or URL.
 *
 * All arguments are forwarded unchanged to file_get_contents(), so the
 * accepted parameters are exactly those of that function.
 *
 * @return false|null false (after clear()) when the contents could not be
 *                    read; nothing is returned on success
 */
function load_file() {
    $args = func_get_args();
    $contents = call_user_func_array('file_get_contents', $args);

    // Decide on the read result itself. error_get_last() also reports errors
    // raised long before this call and would wrongly discard a good load.
    if ($contents === false) {
        $this->clear();

        return false;
    }

    $this->load($contents, true);
}
// set callback function
2021-03-07 13:31:53 +08:00
/**
 * Stores $function_name in the public $callback property.
 *
 * @param callable|string $function_name callback to remember
 */
function set_callback ( $function_name ) {
2021-02-26 15:31:59 +08:00
$this -> callback = $function_name ;
}
// remove callback function
2021-03-07 13:31:53 +08:00
/**
 * Resets the $callback property to null.
 */
function remove_callback () {
2021-02-26 15:31:59 +08:00
$this -> callback = null ;
}
// save dom as string
2021-03-07 13:31:53 +08:00
/**
 * Returns the root node's innertext() and optionally writes it to a file.
 *
 * @param string $filepath when not the empty string, the text is also written
 *                         to this path with an exclusive lock (LOCK_EX)
 *
 * @return string the root node's innertext()
 */
function save ( $filepath = '' ) {
2021-02-26 15:31:59 +08:00
$ret = $this -> root -> innertext ();
2021-03-07 13:31:53 +08:00
if ( $filepath !== '' ) {
// The return value of file_put_contents() is not checked here.
file_put_contents ( $filepath , $ret , LOCK_EX );
}
2021-02-26 15:31:59 +08:00
return $ret ;
}
// find dom node by css selector
// Paperg - allow us to specify that we want case insensitive testing of the value of the selector.
2021-03-07 13:31:53 +08:00
/**
 * Forwards all three arguments to the root node's find() and returns its result.
 *
 * @param string   $selector  css selector
 * @param int|null $idx       index forwarded to the root node's find(); null by default
 * @param bool     $lowercase forwarded to the root node's find(); false by default
 */
function find ( $selector , $idx = null , $lowercase = false ) {
2021-02-26 15:31:59 +08:00
return $this -> root -> find ( $selector , $idx , $lowercase );
}
// clean up memory due to php5 circular references memory leak...
2021-03-07 13:31:53 +08:00
/**
 * Releases everything this object references.
 *
 * Calls clear() on every entry of $nodes (and of $children, if that property
 * is set), then on $parent and $root, unsetting the latter two afterwards.
 * Finally $doc and $noise are unset.
 */
function clear() {
    foreach ($this->nodes as $node) {
        $node->clear();
        $node = null;
    }

    // This add next line is documented in the sourceforge repository. 2977248 as a fix for ongoing memory leaks that occur even with the use of clear.
    if (isset($this->children)) {
        foreach ($this->children as $child) {
            $child->clear();
            $child = null;
        }
    }

    if (isset($this->parent)) {
        $this->parent->clear();
        unset($this->parent);
    }

    if (isset($this->root)) {
        $this->root->clear();
        unset($this->root);
    }

    unset($this->doc, $this->noise);
}
2021-03-07 13:31:53 +08:00
/**
 * Forwards to the root node's dump().
 *
 * @param bool $show_attr forwarded to the root node's dump(); true by default
 */
function dump ( $show_attr = true ) {
2021-02-26 15:31:59 +08:00
$this -> root -> dump ( $show_attr );
}
// prepare HTML data and init everything
2021-03-07 13:31:53 +08:00
/**
 * Resets the parser state for a new document and creates the root node.
 *
 * Runs clear() first, records the document sizes, optionally replaces CR/LF
 * sequences, then initialises position, cursor, noise list, node list and
 * the root node (tag 'root', nodetype HDOM_TYPE_ROOT), which also becomes
 * the current parent.
 *
 * @param string $str             document text
 * @param bool   $lowercase       stored in $lowercase
 * @param bool   $stripRN         when true, the str_replace() calls below are applied to $str
 * @param string $defaultBRText   stored in $default_br_text
 * @param string $defaultSpanText stored in $default_span_text
 */
protected function prepare ( $str , $lowercase = true , $stripRN = true , $defaultBRText = DEFAULT_BR_TEXT , $defaultSpanText = DEFAULT_SPAN_TEXT ) {
2021-02-26 15:31:59 +08:00
$this -> clear ();
// set the length of content before we do anything to it.
$this -> size = strlen ( $str );
// Save the original size of the html that we got in. It might be useful to someone.
$this -> original_size = $this -> size ;
//before we save the string as the doc... strip out the \r \n's if we are told to.
if ( $stripRN ) {
$str = str_replace ( " \r " , " " , $str );
$str = str_replace ( " \n " , " " , $str );
// set the length of content since we have changed it.
$this -> size = strlen ( $str );
}
2021-03-07 13:31:53 +08:00
$this -> doc = $str ;
$this -> pos = 0 ;
// cursor starts at 1; the root node below gets HDOM_INFO_BEGIN = -1
$this -> cursor = 1 ;
$this -> noise = array ();
$this -> nodes = array ();
$this -> lowercase = $lowercase ;
$this -> default_br_text = $defaultBRText ;
$this -> default_span_text = $defaultSpanText ;
$this -> root = new simple_html_dom_node ( $this );
$this -> root -> tag = 'root' ;
$this -> root -> _ [ HDOM_INFO_BEGIN ] = - 1 ;
$this -> root -> nodetype = HDOM_TYPE_ROOT ;
$this -> parent = $this -> root ;
// NOTE(review): for an empty document $char is not reset here, so it keeps whatever value it had before - confirm this is intended.
if ( $this -> size > 0 ) {
$this -> char = $this -> doc [ 0 ];
}
2021-02-26 15:31:59 +08:00
}
// parse html content
2021-03-07 13:31:53 +08:00
/**
 * Consumes the next piece of the document.
 *
 * Text in front of the next '<' becomes a text node linked under the current
 * parent. When there is no such text, the work is handed to read_tag().
 *
 * @return bool true after a text node was added, otherwise read_tag()'s result
 */
protected function parse() {
    $text = $this->copy_until_char('<');
    if ($text === '') {
        // Nothing before the next '<': let read_tag() deal with it.
        return $this->read_tag();
    }

    // text
    $textNode = new simple_html_dom_node($this);
    $this->cursor++;
    $textNode->_[HDOM_INFO_TEXT] = $text;
    $this->link_nodes($textNode, false);

    return true;
}
// PAPERG - dkchou - added this to try to identify the character set of the page we have just parsed so we know better how to spit it out later.
// NOTE: IF you provide a routine called get_last_retrieve_url_contents_content_type which returns the CURLINFO_CONTENT_TYPE from the last curl_exec
// (or the content_type header from the last transfer), we will parse THAT, and if a charset is specified, we will use it over any other mechanism.
2021-03-07 13:31:53 +08:00
/**
 * Determines the charset of the parsed document and stores it in $_charset.
 *
 * Sources, in order of preference:
 *  1. the content-type header returned by the optional user function
 *     get_last_retrieve_url_contents_content_type();
 *  2. the content attribute of a meta[http-equiv=Content-Type] element
 *     (ISO-8859-1 when that attribute carries no charset);
 *  3. mb_detect_encoding() over the document's plain text;
 *  4. UTF-8 as the last resort.
 * ISO-8859-1 / Latin1 / Latin-1 are finally replaced by CP1252.
 *
 * @return string the charset that was stored in $_charset
 */
protected function parse_charset() {
    global $debugObject;

    // charset=<token>, matched case-insensitively and optionally quoted.
    // The token ends at whitespace, ';' or a quote so that trailing header
    // parameters do not end up inside the charset name.
    $charsetPattern = '/charset\s*=\s*["\']?([^\s;"\']+)/i';
    $charset = null;

    if (function_exists('get_last_retrieve_url_contents_content_type')) {
        $contentTypeHeader = get_last_retrieve_url_contents_content_type();
        // The user routine may return a non-string when no transfer happened.
        $success = preg_match($charsetPattern, (string) $contentTypeHeader, $matches);
        if ($success) {
            $charset = $matches[1];
            if (is_object($debugObject)) {
                $debugObject->debugLog(2, 'header content-type found charset of: ' . $charset);
            }
        }
    }

    if (empty($charset)) {
        $el = $this->root->find('meta[http-equiv=Content-Type]', 0);
        if (!empty($el)) {
            $fullvalue = $el->content;
            if (is_object($debugObject)) {
                $debugObject->debugLog(2, 'meta content-type tag found' . $fullvalue);
            }

            if (!empty($fullvalue)) {
                $success = preg_match($charsetPattern, $fullvalue, $matches);
                if ($success) {
                    $charset = $matches[1];
                } else {
                    // If there is a meta tag, and they don't specify the character set, research says that it's typically ISO-8859-1
                    if (is_object($debugObject)) {
                        $debugObject->debugLog(2, 'meta content-type tag couldn\'t be parsed. using iso-8859 default.');
                    }
                    $charset = 'ISO-8859-1';
                }
            }
        }
    }

    // If we couldn't find a charset above, then lets try to detect one based on the text we got...
    if (empty($charset)) {
        // Have php try to detect the encoding from the text given to us.
        $charset = mb_detect_encoding($this->root->plaintext . 'ascii', array('UTF-8', 'CP1252'));
        if (is_object($debugObject)) {
            $debugObject->debugLog(2, 'mb_detect found: ' . $charset);
        }
        // and if this doesn't work... then we need to just wrongheadedly assume it's UTF-8 so that we can move on - cause this will usually give us most of what we need...
        if ($charset === false) {
            if (is_object($debugObject)) {
                $debugObject->debugLog(2, 'since mb_detect failed - using default of utf-8');
            }
            $charset = 'UTF-8';
        }
    }

    // Since CP1252 is a superset, if we get one of it's subsets, we want it instead.
    $lowerCharset = strtolower($charset);
    if ($lowerCharset == 'iso-8859-1' || $lowerCharset == 'latin1' || $lowerCharset == 'latin-1') {
        if (is_object($debugObject)) {
            $debugObject->debugLog(2, 'replacing ' . $charset . ' with CP1252 as its a superset');
        }
        $charset = 'CP1252';
    }

    if (is_object($debugObject)) {
        $debugObject->debugLog(1, 'EXIT - ' . $charset);
    }

    return $this->_charset = $charset;
}
// read tag info
2021-03-07 13:31:53 +08:00
/**
 * Reads one tag starting at the current position and updates the tree.
 *
 * Handles, in this order: end tags (including recovery when the end tag does
 * not match the current parent), "<!..." constructs (comments / unknown),
 * stray '<' that turn out to be text, and begin tags with their attributes.
 *
 * @return bool false when the current character is not '<' (the root node's
 *              HDOM_INFO_END is set to the cursor in that case), true after
 *              something was consumed
 */
protected function read_tag() {
    if ($this->char !== '<') {
        $this->root->_[HDOM_INFO_END] = $this->cursor;

        return false;
    }
    $begin_tag_pos = $this->pos;
    $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

    // end tag
    if ($this->char === '/') {
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
        // This represents the change in the simple_html_dom trunk from revision 180 to 181.
        // $this->skip($this->token_blank_t);
        $this->skip($this->token_blank);
        $tag = $this->copy_until_char('>');

        // skip attributes in end tag
        if (($pos = strpos($tag, ' ')) !== false) {
            $tag = substr($tag, 0, $pos);
        }

        $parent_lower = strtolower($this->parent->tag);
        $tag_lower = strtolower($tag);

        if ($parent_lower !== $tag_lower) {
            if (isset($this->optional_closing_tags[$parent_lower]) && isset($this->block_tags[$tag_lower])) {
                $this->parent->_[HDOM_INFO_END] = 0;
                $org_parent = $this->parent;

                // Walk up looking for the element this end tag belongs to.
                while (($this->parent->parent) && strtolower($this->parent->tag) !== $tag_lower) {
                    $this->parent = $this->parent->parent;
                }

                if (strtolower($this->parent->tag) !== $tag_lower) {
                    $this->parent = $org_parent; // restore original parent
                    if ($this->parent->parent) {
                        $this->parent = $this->parent->parent;
                    }
                    $this->parent->_[HDOM_INFO_END] = $this->cursor;

                    return $this->as_text_node($tag);
                }
            } else if (($this->parent->parent) && isset($this->block_tags[$tag_lower])) {
                $this->parent->_[HDOM_INFO_END] = 0;
                $org_parent = $this->parent;

                while (($this->parent->parent) && strtolower($this->parent->tag) !== $tag_lower) {
                    $this->parent = $this->parent->parent;
                }

                if (strtolower($this->parent->tag) !== $tag_lower) {
                    $this->parent = $org_parent; // restore original parent
                    $this->parent->_[HDOM_INFO_END] = $this->cursor;

                    return $this->as_text_node($tag);
                }
            } else if (($this->parent->parent) && strtolower($this->parent->parent->tag) === $tag_lower) {
                $this->parent->_[HDOM_INFO_END] = 0;
                $this->parent = $this->parent->parent;
            } else {
                return $this->as_text_node($tag);
            }
        }

        $this->parent->_[HDOM_INFO_END] = $this->cursor;
        if ($this->parent->parent) {
            $this->parent = $this->parent->parent;
        }
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

        return true;
    }

    $node = new simple_html_dom_node($this);
    $node->_[HDOM_INFO_BEGIN] = $this->cursor;
    ++$this->cursor;
    $tag = $this->copy_until($this->token_slash);
    $node->tag_start = $begin_tag_pos;

    // doctype, cdata & comments...
    if (isset($tag[0]) && $tag[0] === '!') {
        $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>');
        if (isset($tag[2]) && $tag[1] === '-' && $tag[2] === '-') {
            $node->nodetype = HDOM_TYPE_COMMENT;
            $node->tag = 'comment';
        } else {
            $node->nodetype = HDOM_TYPE_UNKNOWN;
            $node->tag = 'unknown';
        }
        if ($this->char === '>') {
            $node->_[HDOM_INFO_TEXT] .= '>';
        }
        $this->link_nodes($node, true);
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

        return true;
    }

    // text
    // (the old "$pos = strpos(...) !== false" assigned the boolean to an unused $pos)
    if (strpos($tag, '<') !== false) {
        $tag = '<' . substr($tag, 0, -1);
        $node->_[HDOM_INFO_TEXT] = $tag;
        $this->link_nodes($node, false);
        $this->char = $this->doc[--$this->pos]; // prev

        return true;
    }

    // The hyphen is escaped: an unescaped "\w-:" is rejected by PCRE2
    // (PHP >= 7.3) as an invalid range, which makes preg_match() fail.
    if (!preg_match('/^[\w\-:]+$/', $tag)) {
        $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>');
        if ($this->char === '<') {
            $this->link_nodes($node, false);

            return true;
        }
        if ($this->char === '>') {
            $node->_[HDOM_INFO_TEXT] .= '>';
        }
        $this->link_nodes($node, false);
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

        return true;
    }

    // begin tag
    $node->nodetype = HDOM_TYPE_ELEMENT;
    $tag_lower = strtolower($tag);
    $node->tag = ($this->lowercase) ? $tag_lower : $tag;

    // handle optional closing tags
    if (isset($this->optional_closing_tags[$tag_lower])) {
        while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) {
            $this->parent->_[HDOM_INFO_END] = 0;
            $this->parent = $this->parent->parent;
        }
        $node->parent = $this->parent;
    }

    $guard = 0; // prevent infinity loop
    $space = array($this->copy_skip($this->token_blank), '', '');

    // attributes
    do {
        if ($this->char !== null && $space[0] === '') {
            break;
        }
        $name = $this->copy_until($this->token_equal);
        if ($guard === $this->pos) {
            // No progress since the previous round: step one character forward.
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
            continue;
        }
        $guard = $this->pos;

        // handle endless '<'
        if ($this->pos >= $this->size - 1 && $this->char !== '>') {
            $node->nodetype = HDOM_TYPE_TEXT;
            $node->_[HDOM_INFO_END] = 0;
            $node->_[HDOM_INFO_TEXT] = '<' . $tag . $space[0] . $name;
            $node->tag = 'text';
            $this->link_nodes($node, false);

            return true;
        }

        // handle mismatch '<'
        if ($this->doc[$this->pos - 1] == '<') {
            $node->nodetype = HDOM_TYPE_TEXT;
            $node->tag = 'text';
            $node->attr = array();
            $node->_[HDOM_INFO_END] = 0;
            $node->_[HDOM_INFO_TEXT] = substr($this->doc, $begin_tag_pos, $this->pos - $begin_tag_pos - 1);
            $this->pos -= 2;
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
            $this->link_nodes($node, false);

            return true;
        }

        if ($name !== '/' && $name !== '') {
            $space[1] = $this->copy_skip($this->token_blank);
            $name = $this->restore_noise($name);
            if ($this->lowercase) {
                $name = strtolower($name);
            }
            if ($this->char === '=') {
                $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
                $this->parse_attr($node, $name, $space);
            } else {
                //no value attr: nowrap, checked selected...
                $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
                $node->attr[$name] = true;
                if ($this->char != '>') {
                    $this->char = $this->doc[--$this->pos]; // prev
                }
            }
            $node->_[HDOM_INFO_SPACE][] = $space;
            $space = array($this->copy_skip($this->token_blank), '', '');
        } else {
            break;
        }
    } while ($this->char !== '>' && $this->char !== '/');

    $this->link_nodes($node, true);
    $node->_[HDOM_INFO_ENDSPACE] = $space[0];

    // check self closing
    if ($this->copy_until_char_escape('>') === '/') {
        $node->_[HDOM_INFO_ENDSPACE] .= '/';
        $node->_[HDOM_INFO_END] = 0;
    } else {
        // reset parent
        if (!isset($this->self_closing_tags[strtolower($node->tag)])) {
            $this->parent = $node;
        }
    }
    $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

    // If it's a BR tag, we need to set it's text to the default text.
    // This way when we see it in plaintext, we can generate formatting that the user wants.
    // since a br tag never has sub nodes, this works well.
    if ($node->tag == 'br') {
        $node->_[HDOM_INFO_INNER] = $this->default_br_text;
    }

    return true;
}
// parse attributes
2021-03-07 13:31:53 +08:00
/**
 * Reads the value of attribute $name at the current position into $node.
 *
 * The quote style (double, single or none) is appended to the node's
 * HDOM_INFO_QUOTE list and the value is stored in $node->attr[$name] after
 * restore_noise(). If the node already has an attribute with that name the
 * method returns immediately and the first value wins.
 *
 * @param object $node  node that receives the attribute
 * @param string $name  attribute name
 * @param array  $space passed by reference; index 2 receives the blanks
 *                      skipped in front of the value
 */
protected function parse_attr ( $node , $name , & $space ) {
2021-02-26 15:31:59 +08:00
// Per sourceforge: http://sourceforge.net/tracker/?func=detail&aid=3061408&group_id=218559&atid=1044037
// If the attribute is already defined inside a tag, only pay attention to the first one as opposed to the last one.
2021-03-07 13:31:53 +08:00
if ( isset ( $node -> attr [ $name ])) {
2021-02-26 15:31:59 +08:00
return ;
}
$space [ 2 ] = $this -> copy_skip ( $this -> token_blank );
// The character at the cursor decides how the value is delimited.
switch ( $this -> char ) {
case '"' :
$node -> _ [ HDOM_INFO_QUOTE ][] = HDOM_QUOTE_DOUBLE ;
2021-03-07 13:31:53 +08:00
$this -> char = ( ++ $this -> pos < $this -> size ) ? $this -> doc [ $this -> pos ] : null ; // next
$node -> attr [ $name ] = $this -> restore_noise ( $this -> copy_until_char_escape ( '"' ));
$this -> char = ( ++ $this -> pos < $this -> size ) ? $this -> doc [ $this -> pos ] : null ; // next
2021-02-26 15:31:59 +08:00
break ;
case '\'' :
$node -> _ [ HDOM_INFO_QUOTE ][] = HDOM_QUOTE_SINGLE ;
2021-03-07 13:31:53 +08:00
$this -> char = ( ++ $this -> pos < $this -> size ) ? $this -> doc [ $this -> pos ] : null ; // next
$node -> attr [ $name ] = $this -> restore_noise ( $this -> copy_until_char_escape ( '\'' ));
$this -> char = ( ++ $this -> pos < $this -> size ) ? $this -> doc [ $this -> pos ] : null ; // next
2021-02-26 15:31:59 +08:00
break ;
default :
// Unquoted value: copy up to the next character listed in $token_attr.
$node -> _ [ HDOM_INFO_QUOTE ][] = HDOM_QUOTE_NO ;
2021-03-07 13:31:53 +08:00
$node -> attr [ $name ] = $this -> restore_noise ( $this -> copy_until ( $this -> token_attr ));
2021-02-26 15:31:59 +08:00
}
// PaperG: Attributes should not have \r or \n in them, that counts as html whitespace.
$node -> attr [ $name ] = str_replace ( " \r " , " " , $node -> attr [ $name ]);
$node -> attr [ $name ] = str_replace ( " \n " , " " , $node -> attr [ $name ]);
// PaperG: If this is a "class" selector, lets get rid of the preceding and trailing space since some people leave it in the multi class case.
if ( $name == " class " ) {
$node -> attr [ $name ] = trim ( $node -> attr [ $name ]);
}
}
// link node's parent
2021-03-07 13:31:53 +08:00
// Attach $node underneath the parser's current parent node.
//   $node     - node to attach; its parent pointer is overwritten.
//   $is_child - when truthy the node is also appended to the parent's
//               children list, otherwise it only appears in its nodes list.
protected function link_nodes(&$node, $is_child) {
    $owner = $this->parent;

    $node->parent = $owner;
    $owner->nodes[] = $node;

    if (!$is_child) {
        return;
    }
    $owner->children[] = $node;
}
// as a text node
2021-03-07 13:31:53 +08:00
// Record a closing tag as literal text: a new node whose text is
// '</' . $tag . '>' is linked under the current parent (nodes list only,
// not children), the cursor is bumped and the read position moves one
// character forward. Always returns true.
protected function as_text_node($tag) {
    $text_node = new simple_html_dom_node($this);
    ++$this->cursor;

    $text_node->_[HDOM_INFO_TEXT] = '</' . $tag . '>';
    $this->link_nodes($text_node, false);

    // step to the next character, or null once past the end of the document
    ++$this->pos;
    if ($this->pos < $this->size) {
        $this->char = $this->doc[$this->pos];
    } else {
        $this->char = null;
    }

    return true;
}
2021-03-07 13:31:53 +08:00
// Advance the read position past the leading run of characters that all
// belong to $chars, then refresh the current character (null at the end).
protected function skip($chars) {
    $run = strspn($this->doc, $chars, $this->pos);
    $this->pos = $this->pos + $run;

    if ($this->pos < $this->size) {
        $this->char = $this->doc[$this->pos];
    } else {
        $this->char = null;
    }
}
2021-03-07 13:31:53 +08:00
// Like skip(), but also returns what was skipped: the leading run of
// characters from $chars starting at the current position ('' when the
// run is empty). The position and current character are updated.
protected function copy_skip($chars) {
    $begin = $this->pos;
    $run = strspn($this->doc, $chars, $begin);

    $this->pos += $run;
    if ($this->pos < $this->size) {
        $this->char = $this->doc[$this->pos];
    } else {
        $this->char = null;
    }

    return ($run === 0) ? '' : substr($this->doc, $begin, $run);
}
2021-03-07 13:31:53 +08:00
// Consume and return the text from the current position up to (not
// including) the first character that appears in $chars. The position
// ends on that stop character, or at the end of the document.
protected function copy_until($chars) {
    $begin = $this->pos;
    $span = strcspn($this->doc, $chars, $begin);

    $this->pos += $span;
    if ($this->pos < $this->size) {
        $this->char = $this->doc[$this->pos];
    } else {
        $this->char = null;
    }

    return substr($this->doc, $begin, $span);
}
2021-03-07 13:31:53 +08:00
// Consume and return the text from the current position up to (not
// including) the next occurrence of $char.
//   - already at end of input (current char is null): returns ''.
//   - $char not found: returns the rest of the document and moves to the end.
//   - $char is at the current position: returns '' and nothing moves.
protected function copy_until_char($char) {
    if ($this->char === null) {
        return '';
    }

    $begin = $this->pos;
    $hit = strpos($this->doc, $char, $begin);

    if ($hit === false) {
        $rest = substr($this->doc, $begin, $this->size - $begin);
        $this->char = null;
        $this->pos = $this->size;
        return $rest;
    }

    if ($hit === $begin) {
        return '';
    }

    $this->char = $this->doc[$hit];
    $this->pos = $hit;
    return substr($this->doc, $begin, $hit - $begin);
}
2021-03-07 13:31:53 +08:00
// Same contract as copy_until_char(), except that an occurrence of $char
// directly preceded by a backslash is treated as escaped and skipped.
//   - already at end of input (current char is null): returns ''.
//   - no unescaped $char found: returns the rest of the document and moves
//     to the end.
//   - $char is at the current position: returns '' and nothing moves.
protected function copy_until_char_escape($char) {
    if ($this->char === null) {
        return '';
    }

    $begin = $this->pos;   // read position; unchanged until we return
    $from = $begin;        // where the next search for $char starts

    for (;;) {
        $hit = strpos($this->doc, $char, $from);

        if ($hit === false) {
            $rest = substr($this->doc, $begin, $this->size - $begin);
            $this->char = null;
            $this->pos = $this->size;
            return $rest;
        }

        if ($hit === $begin) {
            return '';
        }

        if ($this->doc[$hit - 1] !== '\\') {
            $this->char = $this->doc[$hit];
            $this->pos = $hit;
            return substr($this->doc, $begin, $hit - $begin);
        }

        // escaped occurrence: keep looking right after it
        $from = $hit + 1;
    }
}
// remove noise from html content
// save the noise in the $this->noise array.
2021-03-07 13:31:53 +08:00
// Replace every match of $pattern in the document with a placeholder key
// ('___noise___' followed by a 5-character number) and remember the removed
// text in $this->noise under that key.
//   $pattern    - PCRE pattern to look for.
//   $remove_tag - when truthy the whole match is replaced, otherwise only
//                 capture group 1.
// Afterwards the cached document size and current character are refreshed.
protected function remove_noise($pattern, $remove_tag = false) {
    global $debugObject;
    if (is_object($debugObject)) {
        $debugObject->debugLogEntry(1);
    }

    $total = preg_match_all($pattern, $this->doc, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);
    $group = $remove_tag ? 0 : 1;

    // Work from the last match to the first so the byte offsets of the
    // matches still to be processed are not shifted by earlier replacements.
    $i = $total - 1;
    while ($i >= 0) {
        $key = '___noise___' . sprintf('% 5d', count($this->noise) + 1000);
        if (is_object($debugObject)) {
            $debugObject->debugLog(2, 'key is: ' . $key);
        }

        $removed = $matches[$i][$group][0];
        $offset = $matches[$i][$group][1];

        $this->noise[$key] = $removed;
        $this->doc = substr_replace($this->doc, $key, $offset, strlen($removed));

        --$i;
    }

    // reset the length of content
    $this->size = strlen($this->doc);
    if ($this->size > 0) {
        $this->char = $this->doc[0];
    }
}
// restore noise to html content
2021-03-07 13:31:53 +08:00
// Put previously removed noise back into $text.
// Every '___noise___' marker followed by a 5-character key is replaced by
// the text stored in $this->noise for that key.
//   - unknown key: the marker is replaced by 'UNDEFINED NOISE FOR KEY: <key>'.
//   - marker too close to the end of $text to carry a key: it is replaced
//     by 'NO NUMERIC NOISE KEY'.
// Returns the text with all resolvable markers substituted.
function restore_noise($text) {
    global $debugObject;
    if (is_object($debugObject)) {
        $debugObject->debugLogEntry(1);
    }

    $marker = '___noise___';
    // Index the next search starts from. Everything before it is final.
    $offset = 0;

    while (($pos = strpos($text, $marker, $offset)) !== false) {
        // Sometimes there is a broken piece of markup, and we don't GET the pos+11 etc... token which indicates a problem outside of us...
        if (strlen($text) > $pos + 15) {
            $key = $marker . substr($text, $pos + 11, 5);
            if (is_object($debugObject)) {
                $debugObject->debugLog(2, 'located key of: ' . $key);
            }

            if (isset($this->noise[$key])) {
                $text = substr($text, 0, $pos) . $this->noise[$key] . substr($text, $pos + 16);
                // The restored text is scanned again (stored noise may itself
                // contain markers). Back up by up to 10 bytes so a marker
                // formed across the splice point is still seen, but never
                // re-enter text that was already finalised.
                // NOTE(review): a stored value containing its own key would
                // still be expanded repeatedly; not guarded against here.
                if ($pos - 10 > $offset) {
                    $offset = $pos - 10;
                }
            } else {
                // The notice embeds the key, which starts with the marker.
                // Resume scanning AFTER the notice, otherwise the same marker
                // is found again on every pass and the loop never ends.
                $notice = 'UNDEFINED NOISE FOR KEY: ' . $key;
                $text = substr($text, 0, $pos) . $notice . substr($text, $pos + 16);
                $offset = $pos + strlen($notice);
            }
        } else {
            // There is no valid key being given back to us... We must get rid of the ___noise___ or we will have a problem.
            $text = substr($text, 0, $pos) . 'NO NUMERIC NOISE KEY' . substr($text, $pos + 11);
            if ($pos - 10 > $offset) {
                $offset = $pos - 10;
            }
        }
    }

    return $text;
}
// Sometimes we NEED one of the noise elements.
2021-03-07 13:31:53 +08:00
// Look through the stored noise entries and return the first one that
// contains $text. Returns null when no entry contains it.
function search_noise($text) {
    global $debugObject;
    if (is_object($debugObject)) {
        $debugObject->debugLogEntry(1);
    }

    $found = null;
    foreach ($this->noise as $candidate) {
        if (strpos($candidate, $text) === false) {
            continue;
        }
        $found = $candidate;
        break;
    }

    return $found;
}
2021-03-07 13:31:53 +08:00
// String conversion of the document: the inner text of the root node.
function __toString() {
    $root = $this->root;
    return $root->innertext();
}
2021-03-07 13:31:53 +08:00
// Magic read access for the virtual properties of the document:
//   outertext, innertext -> inner text of the root node
//   plaintext            -> text() of the root node
//   charset              -> $this->_charset
//   target_charset       -> $this->_target_charset
// Any other name yields null.
function __get($name) {
    // Loose (==) comparison is deliberate: it gives switch-style matching.
    if ($name == 'outertext' || $name == 'innertext') {
        return $this->root->innertext();
    }
    if ($name == 'plaintext') {
        return $this->root->text();
    }
    if ($name == 'charset') {
        return $this->_charset;
    }
    if ($name == 'target_charset') {
        return $this->_target_charset;
    }
    return null;
}
// camel naming conventions
2021-03-07 13:31:53 +08:00
// camelCase alias: forwards to childNodes() of the root node.
function childNodes($idx = -1) {
    $root = $this->root;
    return $root->childNodes($idx);
}
// camelCase alias: forwards to first_child() of the root node.
function firstChild() {
    $root = $this->root;
    return $root->first_child();
}
// camelCase alias: forwards to last_child() of the root node.
function lastChild() {
    $root = $this->root;
    return $root->last_child();
}
// Build a stand-alone element by parsing "<name>value</name>" and returning
// the first child of the parsed document.
//   $name  - tag name of the element.
//   $value - inner content placed between the opening and closing tag.
// Returns null when the markup could not be parsed into a document.
function createElement($name, $value = null) {
    // The markup must not contain padding: a '<' followed by a space is not
    // the start of a tag, so a padded literal yields no element at all.
    $dom = @str_get_html("<$name>$value</$name>");
    if (!$dom) {
        // NOTE(review): presumably str_get_html() returns a falsy value for
        // input it refuses to parse - confirm against its definition.
        return null;
    }
    return @$dom->first_child();
}
// Build a stand-alone text node by parsing $value and returning the last
// entry of the parsed document's node list.
// Returns null when $value could not be parsed into a document, and false
// when the parsed document has no nodes (the result of end() on an empty
// array).
function createTextNode($value) {
    $dom = @str_get_html($value);
    if (!$dom) {
        // NOTE(review): presumably str_get_html() returns a falsy value for
        // input it refuses to parse (e.g. an empty string) - confirm.
        // Reading ->nodes from such a value must be avoided.
        return null;
    }
    // end() takes its argument by reference, so give it a real variable.
    $nodes = $dom->nodes;
    return @end($nodes);
}
// camelCase helper: return the first element matching the id selector
// "#<id>" (index 0 of the find() result).
function getElementById($id) {
    // '#' has to be immediately followed by the id, with no surrounding
    // whitespace, to form an id selector.
    return $this->find("#$id", 0);
}
// camelCase helper: run find() with the id selector "#<id>".
//   $idx - passed straight through to find() as its index argument
//          (null by default).
function getElementsById($id, $idx = null) {
    // '#' has to be immediately followed by the id, with no surrounding
    // whitespace, to form an id selector.
    return $this->find("#$id", $idx);
}
// camelCase helper: find() with $name as the selector and index 0.
function getElementByTagName($name) {
    $first = $this->find($name, 0);
    return $first;
}
// camelCase helper: find() with $name as the selector; $idx (default -1)
// is handed to find() unchanged.
function getElementsByTagName($name, $idx = -1) {
    $matches = $this->find($name, $idx);
    return $matches;
}
// camelCase alias for load_file(): every argument given to loadFile() is
// forwarded, in order, as an individual argument of load_file().
function loadFile() {
    $args = func_get_args();
    // Forward the arguments one by one. Handing over $args itself would
    // make load_file() receive a single array parameter instead of the
    // values the caller supplied.
    // NOTE(review): assumes load_file() takes the same positional arguments
    // as this alias - confirm against its definition.
    call_user_func_array(array($this, 'load_file'), $args);
}
2021-02-26 15:31:59 +08:00
}
?>