PHP에서 기본 제공하는 DOMDocument 클래스를 이용한 HTML/XML 파싱 예제
페이지 정보
작성자 관리자 (112.♡.173.204) 작성일 21-05-10 13:04 조회 3,876 댓글 0본문
## 출처
https://codingreflections.com/php-parse-html/
## HTML/XML 파일 로드
```
// Create an empty document object; one of the load methods below fills it.
$dom = new DOMDocument();
// The calls below are examples of the available load methods, listed side by side.
// NOTE: $html_string and $xml_string are not defined in this snippet and must be
// supplied by the caller; the file paths are placeholders.
// Methods to load HTML: from a string, or from a file path.
$dom->loadHTML($html_string);
$dom->loadHTMLFile('path/to/htmlfile.html');
// Methods to load XML: from a file path, or from a string.
$dom->load('path/to/xmlfile.xml');
$dom->loadXML($xml_string);
$documentElement = $dom->documentElement;
// $documentElement is an object of the DOMElement class which gives access to the document element.
```
## Id 로 선택
```
// Count the rows of the table whose id is "tablepress-3".
// $res must already hold the HTML markup to inspect.
$dom = new DOMDocument();

// Collect libxml parse warnings internally instead of silencing them with "@";
// the previous setting is restored afterwards so other code is unaffected.
$previous_setting = libxml_use_internal_errors(true);
$dom->loadHTML($res);
libxml_clear_errors();
libxml_use_internal_errors($previous_setting);

$table = $dom->getElementById('tablepress-3'); // DOMElement, or null when no element has this id
if ($table === null) {
    // Without this guard the method call below is a fatal error on pages lacking the table.
    echo "Table 'tablepress-3' was not found";
} else {
    $child_elements = $table->getElementsByTagName('tr'); // DOMNodeList of every <tr> inside the table
    $row_count = $child_elements->length - 1; // one less than the <tr> count — presumably to skip the header row
    echo "No. of rows in the table is " . $row_count;
}
```
## TagName 으로 선택
```
// Print the text content of every <h2> found in $res, one heading per line.
$dom = new DOMDocument();
@$dom->loadHTML($res); // "@" keeps parser warnings out of the output
$headings = $dom->getElementsByTagName('h2');
for ($i = 0; $i < $headings->length; $i++) {
    echo $headings->item($i)->textContent, "\n";
}
```
## XPath 를 이용
```
// Report how many <table> elements in $res have "tablepress" somewhere in their class attribute.
$dom = new DOMDocument();
@$dom->loadHTML($res); // "@" keeps parser warnings out of the output
$finder = new DOMXPath($dom);
$matches = $finder->query("//table[contains(@class,'tablepress')]");
echo "No. of tables " . $matches->length;
```
## a tag 링크 추출
```
// Gather the href of every <a> in $res whose host is exactly wordpress.org, then dump the list.
$dom = new DOMDocument();
@$dom->loadHTML($res); // "@" keeps parser warnings out of the output
$urls = [];
foreach ($dom->getElementsByTagName('a') as $anchor) {
    $href = $anchor->getAttribute('href');
    // Ask parse_url() for the host component only; anything other than the
    // exact string 'wordpress.org' (including no host at all) is skipped.
    if (parse_url($href, PHP_URL_HOST) !== 'wordpress.org') {
        continue;
    }
    $urls[] = $href;
}
var_dump($urls);
```
## 문서에 새 HTML 요소 삽입
```
// Insert a new HTML fragment immediately after the first <p> of $html and print the result.
// $html must already hold the markup of the target document.
$dom = new DOMDocument();

// Collect libxml parse warnings internally rather than muting them with "@";
// the previous setting is restored once both documents are parsed.
$previous_setting = libxml_use_internal_errors(true);
$dom->loadHTML($html);

// The attribute was misspelled "hreh", which left the anchor without a link target.
$html_to_add = '<div><a href="#"><img src="image.jpeg"/></a></div>';
$dom_to_add = new DOMDocument();
$dom_to_add->loadHTML($html_to_add);

libxml_clear_errors();
libxml_use_internal_errors($previous_setting);

$ps = $dom->getElementsByTagName('p');
$first_para = $ps->item(0); // null when the document has no <p>

// loadHTML() wraps a fragment in implied <html><body> elements, so documentElement
// would be that <html> wrapper. Take the fragment's own root (<div>) from <body> instead.
$fragment_body = $dom_to_add->getElementsByTagName('body')->item(0);
$new_element = $fragment_body !== null ? $fragment_body->firstChild : null;

if ($first_para !== null && $new_element !== null) {
    // A node belongs to one document; importNode() makes a deep copy owned by $dom.
    $imported_element = $dom->importNode($new_element, true);
    // Inserting before nextSibling means "insert after"; a null nextSibling appends at the end.
    $first_para->parentNode->insertBefore($imported_element, $first_para->nextSibling);
}

$output = $dom->saveHTML();
echo $output;
```
## 문서에서 요소 삭제
```
// Remove every <div class="del"> from a small document, printing the markup before and after.
$html = '<p>This is our first paragraph</p>
<div class="del">Delete this</div>
<p>This is our second paragraph</p>
<p>This is our third paragraph</p>
<div class="del">Delete this too</div>';
$dom = new DOMDocument();

// Collect libxml parse warnings internally rather than muting them with "@";
// the previous setting is restored right after parsing.
$previous_setting = libxml_use_internal_errors(true);
$dom->loadHTML($html);
libxml_clear_errors();
libxml_use_internal_errors($previous_setting);

echo $dom->saveHTML();

$xpath = new DOMXPath($dom);
$elems = $xpath->query("//div[@class='del']");
// Each match is detached through its own parent, wherever it sits in the tree.
foreach ($elems as $elem) {
    $elem->parentNode->removeChild($elem);
}

echo '<br><br>-------after deletion--------<br><br>';
echo $dom->saveHTML();
```
## 속성 조작
```
// DOMElement attribute helpers used below:
//   getAttribute($attribute_name)                   - get the value of an attribute
//   setAttribute($attribute_name, $attribute_value) - set the value of an attribute
//   hasAttribute($attribute_name)                   - checks whether an element has a certain attribute and returns true or false
$html = '<span class="myclass" data-action="show">Content</span>';
$dom = new DOMDocument();

// Collect libxml parse warnings internally rather than muting them with "@";
// the previous setting is restored right after parsing.
$previous_setting = libxml_use_internal_errors(true);
$dom->loadHTML($html);
libxml_clear_errors();
libxml_use_internal_errors($previous_setting);

$elem = $dom->getElementsByTagName('span')->item(0);
if ($elem->hasAttribute('data-action')) {
    // Show the current value, overwrite it, then show the updated value.
    echo 'attribute value is "' . $elem->getAttribute('data-action') . '"';
    $elem->setAttribute('data-action', 'hide');
    echo '<br>updated attribute value is "' . $elem->getAttribute('data-action') . '"';
}
```
## 출처
https://codingreflections.com/php-parse-html/
https://stackoverflow.com/questions/14395239/class-domdocument-not-found
https://codingreflections.com/php-parse-html/
## HTML/XML 파일 로드
```
// Create an empty document object; one of the load methods below fills it.
$dom = new DOMDocument();
// The calls below are examples of the available load methods, listed side by side.
// NOTE: $html_string and $xml_string are not defined in this snippet and must be
// supplied by the caller; the file paths are placeholders.
// Methods to load HTML: from a string, or from a file path.
$dom->loadHTML($html_string);
$dom->loadHTMLFile('path/to/htmlfile.html');
// Methods to load XML: from a file path, or from a string.
$dom->load('path/to/xmlfile.xml');
$dom->loadXML($xml_string);
$documentElement = $dom->documentElement;
// $documentElement is an object of the DOMElement class which gives access to the document element.
```
## Id 로 선택
```
// Count the rows of the table whose id is "tablepress-3".
// $res must already hold the HTML markup to inspect.
$dom = new DOMDocument();

// Collect libxml parse warnings internally instead of silencing them with "@";
// the previous setting is restored afterwards so other code is unaffected.
$previous_setting = libxml_use_internal_errors(true);
$dom->loadHTML($res);
libxml_clear_errors();
libxml_use_internal_errors($previous_setting);

$table = $dom->getElementById('tablepress-3'); // DOMElement, or null when no element has this id
if ($table === null) {
    // Without this guard the method call below is a fatal error on pages lacking the table.
    echo "Table 'tablepress-3' was not found";
} else {
    $child_elements = $table->getElementsByTagName('tr'); // DOMNodeList of every <tr> inside the table
    $row_count = $child_elements->length - 1; // one less than the <tr> count — presumably to skip the header row
    echo "No. of rows in the table is " . $row_count;
}
```
## TagName 으로 선택
```
// Print the text content of every <h2> found in $res, one heading per line.
$dom = new DOMDocument();
@$dom->loadHTML($res); // "@" keeps parser warnings out of the output
$headings = $dom->getElementsByTagName('h2');
for ($i = 0; $i < $headings->length; $i++) {
    echo $headings->item($i)->textContent, "\n";
}
```
## XPath 를 이용
```
// Report how many <table> elements in $res have "tablepress" somewhere in their class attribute.
$dom = new DOMDocument();
@$dom->loadHTML($res); // "@" keeps parser warnings out of the output
$finder = new DOMXPath($dom);
$matches = $finder->query("//table[contains(@class,'tablepress')]");
echo "No. of tables " . $matches->length;
```
## a tag 링크 추출
```
// Gather the href of every <a> in $res whose host is exactly wordpress.org, then dump the list.
$dom = new DOMDocument();
@$dom->loadHTML($res); // "@" keeps parser warnings out of the output
$urls = [];
foreach ($dom->getElementsByTagName('a') as $anchor) {
    $href = $anchor->getAttribute('href');
    // Ask parse_url() for the host component only; anything other than the
    // exact string 'wordpress.org' (including no host at all) is skipped.
    if (parse_url($href, PHP_URL_HOST) !== 'wordpress.org') {
        continue;
    }
    $urls[] = $href;
}
var_dump($urls);
```
## 문서에 새 HTML 요소 삽입
```
// Insert a new HTML fragment immediately after the first <p> of $html and print the result.
// $html must already hold the markup of the target document.
$dom = new DOMDocument();

// Collect libxml parse warnings internally rather than muting them with "@";
// the previous setting is restored once both documents are parsed.
$previous_setting = libxml_use_internal_errors(true);
$dom->loadHTML($html);

// The attribute was misspelled "hreh", which left the anchor without a link target.
$html_to_add = '<div><a href="#"><img src="image.jpeg"/></a></div>';
$dom_to_add = new DOMDocument();
$dom_to_add->loadHTML($html_to_add);

libxml_clear_errors();
libxml_use_internal_errors($previous_setting);

$ps = $dom->getElementsByTagName('p');
$first_para = $ps->item(0); // null when the document has no <p>

// loadHTML() wraps a fragment in implied <html><body> elements, so documentElement
// would be that <html> wrapper. Take the fragment's own root (<div>) from <body> instead.
$fragment_body = $dom_to_add->getElementsByTagName('body')->item(0);
$new_element = $fragment_body !== null ? $fragment_body->firstChild : null;

if ($first_para !== null && $new_element !== null) {
    // A node belongs to one document; importNode() makes a deep copy owned by $dom.
    $imported_element = $dom->importNode($new_element, true);
    // Inserting before nextSibling means "insert after"; a null nextSibling appends at the end.
    $first_para->parentNode->insertBefore($imported_element, $first_para->nextSibling);
}

$output = $dom->saveHTML();
echo $output;
```
## 문서에서 요소 삭제
```
// Remove every <div class="del"> from a small document, printing the markup before and after.
$html = '<p>This is our first paragraph</p>
<div class="del">Delete this</div>
<p>This is our second paragraph</p>
<p>This is our third paragraph</p>
<div class="del">Delete this too</div>';
$dom = new DOMDocument();

// Collect libxml parse warnings internally rather than muting them with "@";
// the previous setting is restored right after parsing.
$previous_setting = libxml_use_internal_errors(true);
$dom->loadHTML($html);
libxml_clear_errors();
libxml_use_internal_errors($previous_setting);

echo $dom->saveHTML();

$xpath = new DOMXPath($dom);
$elems = $xpath->query("//div[@class='del']");
// Each match is detached through its own parent, wherever it sits in the tree.
foreach ($elems as $elem) {
    $elem->parentNode->removeChild($elem);
}

echo '<br><br>-------after deletion--------<br><br>';
echo $dom->saveHTML();
```
## 속성 조작
```
// DOMElement attribute helpers used below:
//   getAttribute($attribute_name)                   - get the value of an attribute
//   setAttribute($attribute_name, $attribute_value) - set the value of an attribute
//   hasAttribute($attribute_name)                   - checks whether an element has a certain attribute and returns true or false
$html = '<span class="myclass" data-action="show">Content</span>';
$dom = new DOMDocument();

// Collect libxml parse warnings internally rather than muting them with "@";
// the previous setting is restored right after parsing.
$previous_setting = libxml_use_internal_errors(true);
$dom->loadHTML($html);
libxml_clear_errors();
libxml_use_internal_errors($previous_setting);

$elem = $dom->getElementsByTagName('span')->item(0);
if ($elem->hasAttribute('data-action')) {
    // Show the current value, overwrite it, then show the updated value.
    echo 'attribute value is "' . $elem->getAttribute('data-action') . '"';
    $elem->setAttribute('data-action', 'hide');
    echo '<br>updated attribute value is "' . $elem->getAttribute('data-action') . '"';
}
```
## 출처
https://codingreflections.com/php-parse-html/
https://stackoverflow.com/questions/14395239/class-domdocument-not-found
추천0
댓글목록 0
등록된 댓글이 없습니다.