August 4, 2011

preg_match, preg_match_all, preg_replace

To fetch all images from html content:

// Collect every <img ...> tag found in $content; the full tags end up in $images_src[0].
preg_match_all('/<img[^>]+>/i', $content, $images_src);

And, to capture the value of each image's src attribute:

// Group 1 is the opening quote character (single or double), group 2 is the src value;
// the \1 back-reference makes the closing quote match the opening one.
$imgsrc_regex = '#<\s*img [^\>]*src\s*=\s*(["\'])(.*?)\1#im';

Function to get attributes of img tag:

/**
 * Extract the value of one attribute from an <img> tag in an HTML fragment.
 *
 * The first <img> tag that carries the requested attribute is used. The value
 * must be quoted (single or double quotes); unquoted values are not matched.
 *
 * @param string $html HTML fragment to search.
 * @param string $tag  Attribute name to read (defaults to "src").
 *
 * @return string|false The attribute value, or false when no <img> tag with
 *                      that quoted attribute is present.
 */
function img_plain($html, $tag = 'src')
{
    // Cheap pre-check: skip the regex entirely when there is no <img at all.
    if (stripos($html, '<img') === false) {
        return false;
    }

    // preg_quote() keeps attribute names containing regex metacharacters literal.
    // The \s in front of the name stops "src" from matching inside "data-src".
    $pattern = '#<\s*img\b[^>]*\s' . preg_quote($tag, '#') . '\s*=\s*(["\'])(.*?)\1#i';

    if (preg_match($pattern, $html, $matches) === 1) {
        // Group 1 is the quote character, group 2 the attribute value.
        return $matches[2];
    }

    return false;
}

Function to remove all images from content:

/**
 * Remove every <img> tag from an HTML fragment.
 *
 * @param string $html HTML fragment to clean.
 *
 * @return string|null The fragment without <img> tags (null if the regex engine fails).
 */
function clean_content($html)
{
    // \b keeps other tags that merely start with "img" intact, and [^>]* (rather
    // than [^>]+) also removes a bare <img> that has no attributes.
    return preg_replace('/<img\b[^>]*>/i', '', $html);
}

Function to clean html tags from html content:

/**
 * Strip markup from a piece of content.
 *
 * Removes, in this order: anything between "<" and ">", anything between
 * "{" and "}" that contains no ">", and entity-encoded tags of the form
 * "&lt;...&gt;".
 *
 * @param string $content Content to strip.
 *
 * @return string|null The stripped content (null if the regex engine fails).
 */
function remove_html($content)
{
    // No separate <img> pattern is needed: the first pattern already removes
    // every tag, images included.
    $patterns = array(
        '/<[^>]*>/',
        '/\{[^>]*\}/',
        '/&lt;[^>]*&gt;/',
    );

    return preg_replace($patterns, '', $content);
}

To fetch all paragraphs from html content:

// Group 1 is the inner content of each paragraph. \b stops <pre>, <param> etc. from
// being treated as paragraphs; the U modifier makes (.*) stop at the first </p>.
$regex_pattern = '#<p\b[^>]*>(.*)</p>#isU';

Only Alpha Numeric:

// Drop every character that is not an ASCII letter, a digit or whitespace.
$string = preg_replace('/[^a-zA-Z0-9\s]/', '', $string);
Function to strip HTML tags and CDATA markers from HTML content while keeping <p> tags; content containing iframe or img tags is returned unchanged: (V.1)

/**
 * Strip markup and CDATA markers from HTML content while keeping plain
 * <p> / </p> tags.
 *
 * Content containing an <iframe or <img tag is returned untouched. Accented
 * entities such as &eacute; are reduced to their base letter, the remaining
 * entities are decoded, CDATA markers are removed, and then all tags,
 * {...} blocks and &lt;...&gt; sequences are stripped. If stripping leaves
 * nothing, the unstripped (but decoded) content is returned instead.
 *
 * @param string $content HTML content to clean.
 * @param bool   $default Unused; kept so existing callers keep working.
 *
 * @return string The cleaned content.
 */
function clean_re_content($content, $default = true)
{
    if (stristr($content, '<iframe') || stristr($content, '<img')) {
        return $content;
    }

    // Shield <p> and </p> from the tag stripping below. The sentinels contain a
    // control character so they cannot collide with ordinary text.
    $p_open  = "\x01P_OPEN\x01";
    $p_close = "\x01P_CLOSE\x01";
    $content = str_replace(array('<p>', '</p>'), array($p_open, $p_close), $content);

    // &eacute; -> e, &ccedil; -> c, ...: keep only the base letter of accented entities.
    $content = preg_replace(
        '~&([a-z]{1,2})(acute|cedil|circ|grave|lig|orn|ring|slash|th|tilde|uml);~i',
        '$1',
        $content
    );
    $content = html_entity_decode($content, ENT_COMPAT, 'UTF-8');

    // Double-encoded input (&amp;lt;) still holds &lt; / &gt; after one decode pass;
    // resolve those first so that an encoded CDATA marker is caught below.
    $content = str_replace(array('&lt;', '&gt;'), array('<', '>'), $content);
    $content = str_replace(
        array(']]>', '<![CDATA[', 'éé', 'm²'),
        array('', '', 'ee', 'm2'),
        $content
    );

    $stripped = preg_replace(
        array('/<[^>]*>/', '/\{[^>]*\}/', '/&lt;[^>]*&gt;/'),
        '',
        $content
    );

    // Fall back to the unstripped text when stripping removed everything (or failed).
    if ($stripped === null || $stripped === '') {
        $stripped = $content;
    }

    return str_replace(array($p_open, $p_close), array('<p>', '</p>'), $stripped);
}

Function to clean HTML content (V.2): strips tag attributes, <title> elements and CDATA markers, and decodes entities; content containing iframe, img or li tags is returned as is:
/**
 * Normalise HTML content without removing its tags.
 *
 * Content containing <iframe, <img or <li> is returned untouched. Otherwise
 * accented entities such as &eacute; are reduced to their base letter, the
 * remaining entities are decoded, <title>...</title> elements are removed and
 * every opening tag loses its attributes. If the result now contains <li>
 * it is returned at that point; otherwise leftover &lt; / &gt; are resolved
 * and CDATA markers are removed.
 *
 * @param string $content HTML content to clean.
 * @param bool   $default Unused; kept so existing callers keep working.
 *
 * @return string The cleaned content.
 */
function clean_re_content($content, $default = true)
{
    if (stristr($content, '<iframe') || stristr($content, '<img') || stristr($content, '<li>')) {
        return $content;
    }

    // No placeholder swap for <p> / </p> is needed here: nothing below strips
    // tags, and swapping would turn literal "ppppp" / "qqqqq" text into tags.

    // &eacute; -> e, &ccedil; -> c, ...: keep only the base letter of accented entities.
    $content = preg_replace(
        '~&([a-z]{1,2})(acute|cedil|circ|grave|lig|orn|ring|slash|th|tilde|uml);~i',
        '$1',
        $content
    );
    $content = html_entity_decode($content, ENT_COMPAT, 'UTF-8');

    // Drop <title> elements together with their text (s: "." also matches newlines).
    $content = preg_replace('~<title>.*?</title>~s', '', $content);

    // Strip attributes from opening tags, keeping a self-closing slash: <a href="x"> -> <a>.
    $content = preg_replace('/<([a-z][a-z0-9]*)[^>]*?(\/?)>/i', '<$1$2>', $content);

    // Attribute stripping can reveal list items (<li class="x"> -> <li>);
    // such content is returned without the clean-up below.
    if (stristr($content, '<li>')) {
        return $content;
    }

    // Double-encoded input (&amp;lt;) still holds &lt; / &gt; after one decode pass;
    // resolve those first so that an encoded CDATA marker is caught below.
    $content = str_replace(array('&lt;', '&gt;'), array('<', '>'), $content);

    return str_replace(
        array(']]>', '<![CDATA[', 'éé', 'm²'),
        array('', '', 'ee', 'm2'),
        $content
    );
}

Last updated: September 7, 2015