Dom
extends simple_html_dom
in package
ExtendedSimpleHtmlDom
- extends simple_html_dom (Simple HTML DOM library)
- adds parentMatch() to search up the ancestor chain for selector/attribute/callback
- adds tidyCleanHtml() to repair & normalize HTML via ext-tidy if available or DOMDocument fallback
Notes:
- nodeMatches() supports basic selectors: tag, .class, #id, [attr], [attr=value], and simple combinations of them (e.g. div.class#id[attr=val]).
- This design favors performance and predictability for typical server-side HTML processing.
Table of Contents
Properties
- $_charset : mixed
- $_target_charset : mixed
- $callback : mixed
- $default_span_text : mixed
- $lowercase : mixed
- $nodes : mixed
- $original_size : mixed
- $root : mixed
- $size : mixed
- $block_tags : mixed
- $char : mixed
- $cursor : mixed
- $default_br_text : mixed
- $doc : mixed
- $noise : mixed
- $optional_closing_tags : mixed
- $parent : mixed
- $pos : mixed
- $self_closing_tags : mixed
- $token_attr : mixed
- $token_blank : mixed
- $token_equal : mixed
- $token_slash : mixed
Methods
- __construct() : mixed
- __destruct() : mixed
- __get() : mixed
- __toString() : mixed
- childNodes() : mixed
- clear() : mixed
- createElement() : mixed
- createTextNode() : mixed
- dump() : mixed
- find() : mixed
- firstChild() : mixed
- getElementById() : mixed
- getElementByTagName() : mixed
- getElementsById() : mixed
- getElementsByTagName() : mixed
- lastChild() : mixed
- load() : mixed
- load_file() : mixed
- loadFile() : mixed
- nodeMatches() : bool
- nodeMatches
- parentMatch() : simple_html_dom_node|null
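- Flexible parentMatch: searches up the ancestor chain from a start node (or from the node matched by a selector) for an ancestor that satisfies the matcher.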
- remove_callback() : mixed
- restore_noise() : mixed
- save() : mixed
- search_noise() : mixed
- set_callback() : mixed
- tidyCleanHtml() : string
- tidyCleanHtml
- as_text_node() : mixed
- copy_skip() : mixed
- copy_until() : mixed
- copy_until_char() : mixed
- link_nodes() : mixed
- parse() : mixed
- parse_attr() : mixed
- parse_charset() : mixed
- prepare() : mixed
- read_tag() : mixed
- remove_noise() : mixed
- skip() : mixed
- getNodeAttributeValue() : string|null
- getNodeAttributeValue
Properties
$_charset
public
mixed
$_charset
= ''
$_target_charset
public
mixed
$_target_charset
= ''
$callback
public
mixed
$callback
= \null
$default_span_text
public
mixed
$default_span_text
= ''
$lowercase
public
mixed
$lowercase
= \false
$nodes
public
mixed
$nodes
= array()
$original_size
public
mixed
$original_size
$root
public
mixed
$root
= \null
$size
public
mixed
$size
$block_tags
protected
mixed
$block_tags
= array('body' => 1, 'div' => 1, 'form' => 1, 'root' => 1, 'span' => 1, 'table' => 1)
$char
protected
mixed
$char
$cursor
protected
mixed
$cursor
$default_br_text
protected
mixed
$default_br_text
= ''
$doc
protected
mixed
$doc
$noise
protected
mixed
$noise
= array()
$optional_closing_tags
protected
mixed
$optional_closing_tags
= array(
// Not optional, see
// https://www.w3.org/TR/html/textlevel-semantics.html#the-b-element
'b' => array('b' => 1),
'dd' => array('dd' => 1, 'dt' => 1),
// Not optional, see
// https://www.w3.org/TR/html/grouping-content.html#the-dl-element
'dl' => array('dd' => 1, 'dt' => 1),
'dt' => array('dd' => 1, 'dt' => 1),
'li' => array('li' => 1),
'optgroup' => array('optgroup' => 1, 'option' => 1),
'option' => array('optgroup' => 1, 'option' => 1),
'p' => array('p' => 1),
'rp' => array('rp' => 1, 'rt' => 1),
'rt' => array('rp' => 1, 'rt' => 1),
'td' => array('td' => 1, 'th' => 1),
'th' => array('td' => 1, 'th' => 1),
'tr' => array('td' => 1, 'th' => 1, 'tr' => 1),
)
$parent
protected
mixed
$parent
$pos
protected
mixed
$pos
$self_closing_tags
protected
mixed
$self_closing_tags
= array('area' => 1, 'base' => 1, 'br' => 1, 'col' => 1, 'embed' => 1, 'hr' => 1, 'img' => 1, 'input' => 1, 'link' => 1, 'meta' => 1, 'param' => 1, 'source' => 1, 'track' => 1, 'wbr' => 1)
$token_attr
protected
mixed
$token_attr
= ' >'
$token_blank
protected
mixed
$token_blank
= " \t\r\n"
$token_equal
protected
mixed
$token_equal
= ' =/>'
$token_slash
protected
mixed
$token_slash
= " />\r\n\t"
Methods
__construct()
public
__construct([mixed $str = null ][, mixed $lowercase = true ][, mixed $forceTagsClosed = true ][, mixed $target_charset = DEFAULT_TARGET_CHARSET ][, mixed $stripRN = true ][, mixed $defaultBRText = DEFAULT_BR_TEXT ][, mixed $defaultSpanText = DEFAULT_SPAN_TEXT ][, mixed $options = 0 ]) : mixed
Parameters
- $str : mixed = null
- $lowercase : mixed = true
- $forceTagsClosed : mixed = true
- $target_charset : mixed = DEFAULT_TARGET_CHARSET
- $stripRN : mixed = true
- $defaultBRText : mixed = DEFAULT_BR_TEXT
- $defaultSpanText : mixed = DEFAULT_SPAN_TEXT
- $options : mixed = 0
__destruct()
public
__destruct() : mixed
__get()
public
__get(mixed $name) : mixed
Parameters
- $name : mixed
__toString()
public
__toString() : mixed
childNodes()
public
childNodes([mixed $idx = -1 ]) : mixed
Parameters
- $idx : mixed = -1
clear()
public
clear() : mixed
createElement()
public
createElement(mixed $name[, mixed $value = null ]) : mixed
Parameters
- $name : mixed
- $value : mixed = null
createTextNode()
public
createTextNode(mixed $value) : mixed
Parameters
- $value : mixed
dump()
public
dump([mixed $show_attr = true ]) : mixed
Parameters
- $show_attr : mixed = true
find()
public
find(mixed $selector[, mixed $idx = null ][, mixed $lowercase = false ]) : mixed
Parameters
- $selector : mixed
- $idx : mixed = null
- $lowercase : mixed = false
firstChild()
public
firstChild() : mixed
getElementById()
public
getElementById(mixed $id) : mixed
Parameters
- $id : mixed
getElementByTagName()
public
getElementByTagName(mixed $name) : mixed
Parameters
- $name : mixed
getElementsById()
public
getElementsById(mixed $id[, mixed $idx = null ]) : mixed
Parameters
- $id : mixed
- $idx : mixed = null
getElementsByTagName()
public
getElementsByTagName(mixed $name[, mixed $idx = -1 ]) : mixed
Parameters
- $name : mixed
- $idx : mixed = -1
lastChild()
public
lastChild() : mixed
load()
public
load(mixed $str[, mixed $lowercase = true ][, mixed $stripRN = true ][, mixed $defaultBRText = DEFAULT_BR_TEXT ][, mixed $defaultSpanText = DEFAULT_SPAN_TEXT ][, mixed $options = 0 ]) : mixed
Parameters
- $str : mixed
- $lowercase : mixed = true
- $stripRN : mixed = true
- $defaultBRText : mixed = DEFAULT_BR_TEXT
- $defaultSpanText : mixed = DEFAULT_SPAN_TEXT
- $options : mixed = 0
load_file()
public
load_file() : mixed
loadFile()
public
loadFile() : mixed
nodeMatches()
nodeMatches
public
nodeMatches(simple_html_dom_node $node, string $selector) : bool
Basic matcher for a node against a simple CSS selector. Supports: tag, .class, #id, [attr], [attr=value], and combinations such as div.class#id[attr=val].
Parameters
- $node : simple_html_dom_node
- $selector : string
Return values
bool
parentMatch()
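Flexible parentMatch: searches up the ancestor chain from a start node (or from the node matched by a selector) for an ancestor that satisfies the matcher.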
public
parentMatch(simple_html_dom_node|string $startOrSelector[, string|callable|array<string|int, mixed>|null $matcher = null ][, int $startIndex = 0 ][, int|null $maxLevels = null ]) : simple_html_dom_node|null
Usage:
// old usage (explicit node first)
$parent = $this->parentMatch($node, '.parent');
// new usage (start selector, matcher) - similar to find()
$parent = $this->parentMatch('#p1', '.parent'); // finds node '#p1' then searches up
// if second param (matcher) is omitted and first param is selector:
$immediateParent = $this->parentMatch('#p1'); // returns direct parent of #p1
Parameters
- $startOrSelector : simple_html_dom_node|string
-
Node object or a selector string (find-style).
- $matcher : string|callable|array<string|int, mixed>|null = null
-
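The ancestor matcher. If $startOrSelector is a node, this behaves as in the old usage (the node's ancestors are tested against the matcher). If $startOrSelector is a selector string, the ancestors of the node found by that selector are tested against the matcher. If null (and $startOrSelector is a string), the immediate parent of the start node is returned.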
- $startIndex : int = 0
-
Used only when $startOrSelector is a selector string - which matched node index to use.
- $maxLevels : int|null = null
-
Maximum number of ancestor levels to scan (null = scan up to the root).
Return values
simple_html_dom_node|null
remove_callback()
public
remove_callback() : mixed
restore_noise()
public
restore_noise(mixed $text) : mixed
Parameters
- $text : mixed
save()
public
save([mixed $filepath = '' ]) : mixed
Parameters
- $filepath : mixed = ''
search_noise()
public
search_noise(mixed $text) : mixed
Parameters
- $text : mixed
set_callback()
public
set_callback(mixed $function_name) : mixed
Parameters
- $function_name : mixed
tidyCleanHtml()
tidyCleanHtml
public
tidyCleanHtml(string $html[, array<string, mixed> $options = [] ]) : string
Repair and clean HTML using ext-tidy if available; fallback to DOMDocument normalization.
Parameters
- $html : string
- $options : array<string, mixed> = []
-
Tidy options (if using ext-tidy)
Return values
string — Cleaned HTML string.
as_text_node()
protected
as_text_node(mixed $tag) : mixed
Parameters
- $tag : mixed
copy_skip()
protected
copy_skip(mixed $chars) : mixed
Parameters
- $chars : mixed
copy_until()
protected
copy_until(mixed $chars) : mixed
Parameters
- $chars : mixed
copy_until_char()
protected
copy_until_char(mixed $char) : mixed
Parameters
- $char : mixed
link_nodes()
protected
link_nodes(mixed &$node, mixed $is_child) : mixed
Parameters
- $node : mixed
- $is_child : mixed
parse()
protected
parse() : mixed
parse_attr()
protected
parse_attr(mixed $node, mixed $name, mixed &$space) : mixed
Parameters
- $node : mixed
- $name : mixed
- $space : mixed
parse_charset()
protected
parse_charset() : mixed
prepare()
protected
prepare(mixed $str[, mixed $lowercase = true ][, mixed $defaultBRText = DEFAULT_BR_TEXT ][, mixed $defaultSpanText = DEFAULT_SPAN_TEXT ]) : mixed
Parameters
- $str : mixed
- $lowercase : mixed = true
- $defaultBRText : mixed = DEFAULT_BR_TEXT
- $defaultSpanText : mixed = DEFAULT_SPAN_TEXT
read_tag()
protected
read_tag() : mixed
remove_noise()
protected
remove_noise(mixed $pattern[, mixed $remove_tag = false ]) : mixed
Parameters
- $pattern : mixed
- $remove_tag : mixed = false
skip()
protected
skip(mixed $chars) : mixed
Parameters
- $chars : mixed
getNodeAttributeValue()
getNodeAttributeValue
private
getNodeAttributeValue(simple_html_dom_node $node, string $name) : string|null
Retrieve attribute value from node in a resilient way (works with various simplehtmldom versions).
Parameters
- $node : simple_html_dom_node
- $name : string