Documentation

Dom extends simple_html_dom
in package

Final: Yes

ExtendedSimpleHtmlDom

  • extends simple_html_dom (Simple HTML DOM library)
  • adds parentMatch() to search up the ancestor chain for selector/attribute/callback
  • adds tidyCleanHtml() to repair and normalize HTML via ext-tidy when available, falling back to DOMDocument otherwise

Notes:

  • nodeMatches() supports basic selectors: tag, .class, #id, [attr], [attr=value], and simple combinations of them (e.g. div.class#id[attr=val]).
  • This design favors performance and predictability for typical server-side HTML processing.

Table of Contents

Properties

$_charset  : mixed
$_target_charset  : mixed
$callback  : mixed
$default_span_text  : mixed
$lowercase  : mixed
$nodes  : mixed
$original_size  : mixed
$root  : mixed
$size  : mixed
$block_tags  : mixed
$char  : mixed
$cursor  : mixed
$default_br_text  : mixed
$doc  : mixed
$noise  : mixed
$optional_closing_tags  : mixed
$parent  : mixed
$pos  : mixed
$self_closing_tags  : mixed
$token_attr  : mixed
$token_blank  : mixed
$token_equal  : mixed
$token_slash  : mixed

Methods

__construct()  : mixed
__destruct()  : mixed
__get()  : mixed
__toString()  : mixed
childNodes()  : mixed
clear()  : mixed
createElement()  : mixed
createTextNode()  : mixed
dump()  : mixed
find()  : mixed
firstChild()  : mixed
getElementById()  : mixed
getElementByTagName()  : mixed
getElementsById()  : mixed
getElementsByTagName()  : mixed
lastChild()  : mixed
load()  : mixed
load_file()  : mixed
loadFile()  : mixed
nodeMatches()  : bool
nodeMatches
parentMatch()  : simple_html_dom_node|null
Flexible parentMatch: searches up the ancestor chain for a node matching a selector, attributes, or a callback.
remove_callback()  : mixed
restore_noise()  : mixed
save()  : mixed
search_noise()  : mixed
set_callback()  : mixed
tidyCleanHtml()  : string
tidyCleanHtml
as_text_node()  : mixed
copy_skip()  : mixed
copy_until()  : mixed
copy_until_char()  : mixed
link_nodes()  : mixed
parse()  : mixed
parse_attr()  : mixed
parse_charset()  : mixed
prepare()  : mixed
read_tag()  : mixed
remove_noise()  : mixed
skip()  : mixed
getNodeAttributeValue()  : string|null
getNodeAttributeValue

Properties

$block_tags

protected mixed $block_tags = array('body' => 1, 'div' => 1, 'form' => 1, 'root' => 1, 'span' => 1, 'table' => 1)

$optional_closing_tags

protected mixed $optional_closing_tags = array( // Not optional, see // https://www.w3.org/TR/html/textlevel-semantics.html#the-b-element 'b' => array('b' => 1), 'dd' => array('dd' => 1, 'dt' => 1), // Not optional, see // https://www.w3.org/TR/html/grouping-content.html#the-dl-element 'dl' => array('dd' => 1, 'dt' => 1), 'dt' => array('dd' => 1, 'dt' => 1), 'li' => array('li' => 1), 'optgroup' => array('optgroup' => 1, 'option' => 1), 'option' => array('optgroup' => 1, 'option' => 1), 'p' => array('p' => 1), 'rp' => array('rp' => 1, 'rt' => 1), 'rt' => array('rp' => 1, 'rt' => 1), 'td' => array('td' => 1, 'th' => 1), 'th' => array('td' => 1, 'th' => 1), 'tr' => array('td' => 1, 'th' => 1, 'tr' => 1), )

$self_closing_tags

protected mixed $self_closing_tags = array('area' => 1, 'base' => 1, 'br' => 1, 'col' => 1, 'embed' => 1, 'hr' => 1, 'img' => 1, 'input' => 1, 'link' => 1, 'meta' => 1, 'param' => 1, 'source' => 1, 'track' => 1, 'wbr' => 1)

Methods

__construct()

public __construct([mixed $str = null ][, mixed $lowercase = true ][, mixed $forceTagsClosed = true ][, mixed $target_charset = DEFAULT_TARGET_CHARSET ][, mixed $stripRN = true ][, mixed $defaultBRText = DEFAULT_BR_TEXT ][, mixed $defaultSpanText = DEFAULT_SPAN_TEXT ][, mixed $options = 0 ]) : mixed
Parameters
$str : mixed = null
$lowercase : mixed = true
$forceTagsClosed : mixed = true
$target_charset : mixed = DEFAULT_TARGET_CHARSET
$stripRN : mixed = true
$defaultBRText : mixed = DEFAULT_BR_TEXT
$defaultSpanText : mixed = DEFAULT_SPAN_TEXT
$options : mixed = 0

__get()

public __get(mixed $name) : mixed
Parameters
$name : mixed

childNodes()

public childNodes([mixed $idx = -1 ]) : mixed
Parameters
$idx : mixed = -1

createElement()

public createElement(mixed $name[, mixed $value = null ]) : mixed
Parameters
$name : mixed
$value : mixed = null

createTextNode()

public createTextNode(mixed $value) : mixed
Parameters
$value : mixed

dump()

public dump([mixed $show_attr = true ]) : mixed
Parameters
$show_attr : mixed = true

find()

public find(mixed $selector[, mixed $idx = null ][, mixed $lowercase = false ]) : mixed
Parameters
$selector : mixed
$idx : mixed = null
$lowercase : mixed = false

getElementById()

public getElementById(mixed $id) : mixed
Parameters
$id : mixed

getElementByTagName()

public getElementByTagName(mixed $name) : mixed
Parameters
$name : mixed

getElementsById()

public getElementsById(mixed $id[, mixed $idx = null ]) : mixed
Parameters
$id : mixed
$idx : mixed = null

getElementsByTagName()

public getElementsByTagName(mixed $name[, mixed $idx = -1 ]) : mixed
Parameters
$name : mixed
$idx : mixed = -1

load()

public load(mixed $str[, mixed $lowercase = true ][, mixed $stripRN = true ][, mixed $defaultBRText = DEFAULT_BR_TEXT ][, mixed $defaultSpanText = DEFAULT_SPAN_TEXT ][, mixed $options = 0 ]) : mixed
Parameters
$str : mixed
$lowercase : mixed = true
$stripRN : mixed = true
$defaultBRText : mixed = DEFAULT_BR_TEXT
$defaultSpanText : mixed = DEFAULT_SPAN_TEXT
$options : mixed = 0

nodeMatches()

nodeMatches

public nodeMatches(simple_html_dom_node $node, string $selector) : bool

Basic matcher for a node against a simple CSS selector. Supports tag, .class, #id, [attr], [attr=value], and combinations such as div.class#id[attr=val].

Parameters
$node : simple_html_dom_node
$selector : string
Return values
bool

parentMatch()

Flexible parentMatch: searches up the ancestor chain for a node matching a selector, attributes, or a callback.

public parentMatch(simple_html_dom_node|string $startOrSelector[, string|callable|array<string|int, mixed>|null $matcher = null ][, int $startIndex = 0 ][, int|null $maxLevels = null ]) : simple_html_dom_node|null

Usage:

// old usage (explicit node first)
$parent = $this->parentMatch($node, '.parent');

// new usage (start selector, matcher) - similar to find()
$parent = $this->parentMatch('#p1', '.parent'); // finds node '#p1' then searches up

// if second param (matcher) is omitted and first param is selector:
$immediateParent = $this->parentMatch('#p1'); // returns direct parent of #p1

Parameters
$startOrSelector : simple_html_dom_node|string

Node object or a selector string (find-style).

$matcher : string|callable|array<string|int, mixed>|null = null

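The ancestor matcher. If $startOrSelector is a node, it is used as in the original node-first usage. If $startOrSelector is a selector string, it is matched against the ancestors of the node that selector finds; if it is null in that case, the immediate parent of the start node is returned.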

$startIndex : int = 0

Used only when $startOrSelector is a selector string - which matched node index to use.

$maxLevels : int|null = null

Maximum number of ancestor levels to scan (null = scan up to the root).

Return values
simple_html_dom_node|null

restore_noise()

public restore_noise(mixed $text) : mixed
Parameters
$text : mixed

save()

public save([mixed $filepath = '' ]) : mixed
Parameters
$filepath : mixed = ''

search_noise()

public search_noise(mixed $text) : mixed
Parameters
$text : mixed

set_callback()

public set_callback(mixed $function_name) : mixed
Parameters
$function_name : mixed

tidyCleanHtml()

tidyCleanHtml

public tidyCleanHtml(string $html[, array<string, mixed> $options = [] ]) : string

Repair and clean HTML using ext-tidy if available; fallback to DOMDocument normalization.

Parameters
$html : string
$options : array<string, mixed> = []

Tidy options (if using ext-tidy)

Return values
string

Cleaned HTML string

as_text_node()

protected as_text_node(mixed $tag) : mixed
Parameters
$tag : mixed

copy_skip()

protected copy_skip(mixed $chars) : mixed
Parameters
$chars : mixed

copy_until()

protected copy_until(mixed $chars) : mixed
Parameters
$chars : mixed

copy_until_char()

protected copy_until_char(mixed $char) : mixed
Parameters
$char : mixed

link_nodes()

protected link_nodes(mixed &$node, mixed $is_child) : mixed
Parameters
$node : mixed
$is_child : mixed

parse_attr()

protected parse_attr(mixed $node, mixed $name, mixed &$space) : mixed
Parameters
$node : mixed
$name : mixed
$space : mixed

prepare()

protected prepare(mixed $str[, mixed $lowercase = true ][, mixed $defaultBRText = DEFAULT_BR_TEXT ][, mixed $defaultSpanText = DEFAULT_SPAN_TEXT ]) : mixed
Parameters
$str : mixed
$lowercase : mixed = true
$defaultBRText : mixed = DEFAULT_BR_TEXT
$defaultSpanText : mixed = DEFAULT_SPAN_TEXT

remove_noise()

protected remove_noise(mixed $pattern[, mixed $remove_tag = false ]) : mixed
Parameters
$pattern : mixed
$remove_tag : mixed = false

skip()

protected skip(mixed $chars) : mixed
Parameters
$chars : mixed

getNodeAttributeValue()

getNodeAttributeValue

private getNodeAttributeValue(simple_html_dom_node $node, string $name) : string|null

Retrieve attribute value from node in a resilient way (works with various simplehtmldom versions).

Parameters
$node : simple_html_dom_node
$name : string
Return values
string|null
On this page

Search results