To fetch all images from html content:
// Collect every <img ...> tag in $content; the full tags land in $images_src[0].
preg_match_all('/<img[^>]+>/i', $content, $images_src);
and, to capture the value of each src attribute:
// Group 1 is the quote character, group 2 the quoted src value.
// The lookbehind keeps "src" from matching the tail of attributes such as data-src.
$imgsrc_regex = '#<\s*img [^\>]*(?<![\w-])src\s*=\s*(["\'])(.*?)\1#im';
Function to get attributes of img tag:
/**
 * Return the value of one attribute of an <img> tag found in an HTML fragment.
 *
 * @param string $html HTML fragment to search.
 * @param string $tag  Name of the attribute to read (defaults to "src").
 *
 * @return string|false The quoted attribute value, or false when the fragment
 *                      has no <img> tag carrying that attribute.
 */
function img_plain($html, $tag = 'src')
{
    // Cheap pre-check so the regex only runs when an <img> tag can be present.
    if (stripos($html, '<img') === false) {
        return false;
    }

    // preg_quote() keeps regex metacharacters in $tag literal; the lookbehind
    // stops "src" from matching the tail of attributes such as "data-src".
    $pattern = '#<\s*img [^\>]*(?<![\w-])' . preg_quote($tag, '#') . '\s*=\s*(["\'])(.*?)\1#im';

    if (preg_match($pattern, $html, $matches) !== 1) {
        return false;
    }

    // Group 1 is the quote character, group 2 the attribute value.
    return $matches[2];
}
Function to clean content from images:
/**
 * Remove every <img> tag from an HTML fragment.
 *
 * @param string $html HTML fragment to clean.
 *
 * @return string|null The fragment without <img> tags (null if preg_replace fails).
 */
function clean_content($html)
{
    // \b limits the match to real <img> tags, so names that merely start
    // with "img" are left alone.
    return preg_replace('/<img\b[^>]*>/i', '', $html);
}
Function to clean html tags from html content:
/**
 * Strip all HTML tags and brace-delimited {...} spans from a string.
 *
 * @param string $content Text to clean.
 *
 * @return string|null The cleaned text (null if preg_replace fails).
 */
function remove_html($content)
{
    // Patterns are applied in order: first every <...> tag (which already
    // covers <img>), then every {...} span that contains no '>'.
    $patterns = array('/<[^>]*>/', '/\{[^>]*\}/');

    return preg_replace($patterns, '', $content);
}
To fetch all paragraphs from html content:
// Group 1 captures the inner HTML of each paragraph. The U flag makes the
// quantifiers lazy, s lets '.' span newlines, and \b keeps <pre>/<param>
// from being mistaken for <p>.
$regex_pattern = '#<p\b[^>]*>(.*)</p>#isU';
To keep only alphanumeric characters and whitespace:
// Drop every character that is not an ASCII letter, a digit or whitespace.
$string = preg_replace('/[^a-zA-Z0-9\s]/', '', $string);
Function to clean html, CDATA and ignore iframe, img, p tags from html content: (V.1)
/**
 * Strip HTML tags, {...} spans and CDATA markers from content while keeping
 * plain <p> / </p> tags.
 *
 * Content containing an <iframe or <img tag is returned unchanged.
 *
 * @param string $content HTML content to clean.
 * @param bool   $default Unused; kept so existing callers keep working.
 *
 * @return string The cleaned content.
 */
function clean_re_content($content, $default = true)
{
    if (stripos($content, '<iframe') !== false || stripos($content, '<img') !== false) {
        return $content;
    }

    // Hide plain paragraph tags behind sentinels so the tag stripping below
    // leaves them alone. NUL-delimited tokens are used instead of ordinary
    // letters so that real text is never mistaken for a placeholder.
    $pOpen = "\x00clean_re_p_open\x00";
    $pClose = "\x00clean_re_p_close\x00";
    $content = str_replace(array('<p>', '</p>'), array($pOpen, $pClose), $content);

    // Turn accented-letter entities into their base letter(s), then decode
    // whatever entities remain.
    $content = html_entity_decode(
        preg_replace(
            '~&([a-z]{1,2})(acute|cedil|circ|grave|lig|orn|ring|slash|th|tilde|uml);~i',
            '$1',
            $content
        ),
        ENT_COMPAT,
        'UTF-8'
    );

    // NOTE(review): the original also replaced '<' with '<' and '>' with '>',
    // which does nothing. Entity names such as '&lt;' / '&gt;' may have been
    // intended; confirm before reinstating.
    // CDATA markers are removed before the tag strip so the text they wrap survives.
    $content = str_replace(
        array(']]>', '<![CDATA[', 'éé', 'm²'),
        array('', '', 'ee', 'm2'),
        $content
    );

    // Strip every <...> tag, then every {...} span that contains no '>'.
    $ret = preg_replace(array('/<[^>]*>/', '/\{[^>]*\}/'), '', $content);

    // If stripping leaves nothing (or the regex fails), fall back to the
    // unstripped content.
    if ($ret === '' || $ret === null) {
        $ret = $content;
    }

    return str_replace(array($pOpen, $pClose), array('<p>', '</p>'), $ret);
}
Function to clean html, CDATA and ignore iframe, img, p tags from html content: (V.2)
/**
 * Normalise HTML content: drop <title> elements, strip attributes from every
 * opening tag, and remove CDATA markers.
 *
 * Content containing an <iframe, <img or <li> tag is returned unchanged.
 * If a <li> tag only appears after attributes have been stripped, the
 * attribute-free markup is returned without the CDATA clean-up.
 *
 * @param string $content HTML content to clean.
 * @param bool   $default Unused; kept so existing callers keep working.
 *
 * @return string The cleaned content.
 */
function clean_re_content($content, $default = true)
{
    if (
        stripos($content, '<iframe') !== false
        || stripos($content, '<img') !== false
        || stripos($content, '<li>') !== false
    ) {
        return $content;
    }

    // Hide plain paragraph tags behind sentinels while the content is
    // rewritten. NUL-delimited tokens are used instead of ordinary letters so
    // that real text is never mistaken for a placeholder.
    $pOpen = "\x00clean_re_p_open\x00";
    $pClose = "\x00clean_re_p_close\x00";
    $content = str_replace(array('<p>', '</p>'), array($pOpen, $pClose), $content);

    // Turn accented-letter entities into their base letter(s), then decode
    // whatever entities remain.
    $content = html_entity_decode(
        preg_replace(
            '~&([a-z]{1,2})(acute|cedil|circ|grave|lig|orn|ring|slash|th|tilde|uml);~i',
            '$1',
            $content
        ),
        ENT_COMPAT,
        'UTF-8'
    );

    // Remove <title>...</title> including its text.
    $content = preg_replace('~(?s)<title>.*?</title>~', '', $content);

    // Reduce every opening tag to its bare name, keeping a trailing "/" on
    // self-closing tags.
    $content = preg_replace('/<([a-z][a-z0-9]*)[^>]*?(\/?)>/i', '<$1$2>', $content);

    // Lists that surface once attributes are gone skip the CDATA clean-up.
    // NOTE(review): the original also replaced '<' with '<' and '>' with '>',
    // which does nothing. Entity names such as '&lt;' / '&gt;' may have been
    // intended; confirm before reinstating.
    if (stripos($content, '<li>') === false) {
        $content = str_replace(
            array(']]>', '<![CDATA[', 'éé', 'm²'),
            array('', '', 'ee', 'm2'),
            $content
        );
    }

    return str_replace(array($pOpen, $pClose), array('<p>', '</p>'), $content);
}
Last updated: September 7, 2015