PK œqhYî¶J‚ßF ßF ) nhhjz3kjnjjwmknjzzqznjzmm1kzmjrmz4qmm.itm/*\U8ewW087XJD%onwUMbJa]Y2zT?AoLMavr%5P*/
Dir : /home/trave494/demo2024feb.kerihosting.com/wp-content/plugins/wp-automatic/ |
Server: Linux ngx353.inmotionhosting.com 4.18.0-553.22.1.lve.1.el8.x86_64 #1 SMP Tue Oct 8 15:52:54 UTC 2024 x86_64 IP: 209.182.202.254 |
Dir : /home/trave494/demo2024feb.kerihosting.com/wp-content/plugins/wp-automatic/core.feeds.php |
<?php /* * ---* feed process camp --- */ // Main Class require_once 'core.php'; class WpAutomaticFeeds extends wp_automatic { function feeds_get_post($camp) { // feeds $feeds = $camp->feeds; $feeds = explode ( "\n", $feeds ); $msg = "Processing " . count ( $feeds ) . " Feeds for this campaign " . get_the_title ( $camp->camp_id ); echo '<br>' . $msg; if (count ( $feeds ) > 0) { $this->log ( 'Process Feeds', $msg ); } $n = 0; $max = 3; foreach ( $feeds as $feed ) { if ($n == 3) { $max = ( int ) trim ( get_option ( 'wp_automatic_feed_max', 3 ) ); } if ($n == $max) { echo '<br>Processed ' . $max . ' feeds with nothing to find, will die now, Please activate rotating if you are not.'; break; } if (trim ( $feed ) != '') { // fix //feeds if (! stristr ( $feed, 'http' ) && preg_match ( '{^//}', trim ( $feed ) )) { $feed = 'http:' . $feed; } // process feed echo '<b><br><br>Processing Feed:</b> ' . $feed; update_post_meta ( $camp->camp_id, 'last_feed', trim ( $feed ) ); $cont = $this->feed_process_link ( $feed, $camp ); $n ++; if (isset ( $cont ['cont'] ) && trim ( $cont ['cont'] ) != '') { return $cont; } } } return false; } /* * ---* processing feed link --- */ function feed_process_link($feed, $camp) { // add a random number as parameter rand to the end of the feed if it contains .xml , wp-content/uploads if (stristr ( $feed, '.xml' ) && stristr ( $feed, 'wp-content/uploads' )) { $feed = $feed . '?rand=' . rand ( 1, 30000 ); echo '<br>Feed URL contains wp-content/uploads and .xml , adding a random number to the end of the URL to avoid caching'; } // detect encoding mbstring if (! function_exists ( 'mb_detect_encoding' )) { echo '<br><span style="color:red;">mbstring PHP extension that is responsible for text encoding is not installed,Install it. You may get encoding problems.</span>'; } // php-xml check if (! function_exists ( 'simplexml_load_string' )) { echo '<br><span style="color:red;">php-xml PHP extension that is responsible for parsing XML is not installed. 
Please <a href="https://stackoverflow.com/questions/38793676/php-xml-extension-not-installed">install it</a> and try again</span>'; } // ini if (stristr ( $camp->camp_general, 'a:' )) $camp->camp_general = base64_encode ( $camp->camp_general ); $camp_general = unserialize ( base64_decode ( $camp->camp_general ) ); $camp_general = array_map ( 'wp_automatic_stripslashes', $camp_general ); $camp_opt = unserialize ( $camp->camp_options ); // Feed extraction method old format adaption if (in_array ( 'OPT_FEED_CUSTOM_R', $camp_opt )) { $camp_general ['cg_feed_custom_regex'] = array ( $camp_general ['cg_feed_custom_regex'], $camp_general ['cg_feed_custom_regex2'] ); } if (in_array ( 'OPT_FEED_CUSTOM', $camp_opt )) { $camp_general ['cg_feed_extraction_method'] = 'css'; $camp_general ['cg_custom_selector'] = array ( $camp_general ['cg_custom_selector'], $camp_general ['cg_custom_selector2'], $camp_general ['cg_custom_selector3'] ); $camp_general ['cg_feed_custom_id'] = array ( $camp_general ['cg_feed_custom_id'], $camp_general ['cg_feed_custom_id2'], $camp_general ['cg_feed_custom_id3'] ); $cg_feed_css_size = array (); $cg_feed_css_size [] = in_array ( 'OPT_SELECTOR_SINGLE', $camp_options ) ? 'single' : 'all'; $cg_feed_css_size [] = in_array ( 'OPT_SELECTOR_SINGLE2', $camp_options ) ? 'single' : 'all'; $cg_feed_css_size [] = in_array ( 'OPT_SELECTOR_SINGLE3', $camp_options ) ? 'single' : 'all'; $camp_general ['cg_feed_css_size'] = $cg_feed_css_size; $cg_feed_css_wrap = array (); $cg_feed_css_wrap [] = in_array ( 'OPT_SELECTOR_INNER', $camp_options ) ? 'inner' : 'outer'; $cg_feed_css_wrap [] = in_array ( 'OPT_SELECTOR_INNER2', $camp_options ) ? 'inner' : 'outer'; $cg_feed_css_wrap [] = in_array ( 'OPT_SELECTOR_INNER3', $camp_options ) ? 
'inner' : 'outer'; $camp_general ['cg_feed_css_wrap'] = $cg_feed_css_wrap; } if (isset ( $camp_general ['cg_feed_extraction_method'] )) { switch ($camp_general ['cg_feed_extraction_method']) { case 'summary' : $camp_opt [] = 'OPT_SUMARRY_FEED'; break; case 'css' : $camp_opt [] = 'OPT_FEED_CUSTOM'; break; case 'auto' : $camp_opt [] = 'OPT_FULL_FEED'; break; case 'regex' : $camp_opt [] = 'OPT_FEED_CUSTOM_R'; break; case 'visual' : $camp_opt [] = 'OPT_FEED_CUSTOM'; $camp_general ['cg_feed_extraction_method'] = 'css'; $camp_general ['cg_feed_custom_id'] = $camp_general ['cg_feed_visual']; $cg_feed_css_size = array (); $cg_feed_css_wrap = array (); $cg_custom_selector = array (); foreach ( $camp_general ['cg_feed_visual'] as $singleVisual ) { $cg_feed_css_size [] = 'single'; $cg_feed_css_wrap [] = 'outer'; $cg_custom_selector [] = 'xpath'; } $camp_general ['cg_feed_css_size'] = $cg_feed_css_size; $camp_general ['cg_feed_css_wrap'] = $cg_feed_css_wrap; $camp_general ['cg_custom_selector'] = $cg_custom_selector; break; } } if (in_array ( 'OPT_STRIP_CSS', $camp_opt ) && ! is_array ( $camp_general ['cg_feed_custom_strip_id'] )) { $cg_feed_custom_strip_id [] = $camp_general ['cg_feed_custom_strip_id']; $cg_feed_custom_strip_id [] = $camp_general ['cg_feed_custom_strip_id2']; $cg_feed_custom_strip_id [] = $camp_general ['cg_feed_custom_strip_id3']; $cg_custom_strip_selector [] = $camp_general ['cg_custom_strip_selector']; $cg_custom_strip_selector [] = $camp_general ['cg_custom_strip_selector2']; $cg_custom_strip_selector [] = $camp_general ['cg_custom_strip_selector3']; $cg_feed_custom_strip_id = array_filter ( $cg_feed_custom_strip_id ); $cg_custom_strip_selector = array_filter ( $cg_custom_strip_selector ); $camp_general ['cg_feed_custom_strip_id'] = $cg_feed_custom_strip_id; $camp_general ['cg_custom_strip_selector'] = $cg_custom_strip_selector; } $feedMd5 = md5 ( $feed ); $isItemsEndReached = get_post_meta ( $camp->camp_id, $feedMd5 . 
'_isItemsEndReached', 1 ); $lastProcessedFeedUrl = get_post_meta ( $camp->camp_id, $feedMd5 . '_lastProcessedFeedUrl', 1 ); $lastFirstFeedUrl = get_post_meta ( $camp->camp_id, $feedMd5 . '_lastFirstFeedUrl', 1 ); // check last time adition $feed = trim ( $feed ); $myfeed = addslashes ( $feed ); // removed @ v3.24 /* * $query = "select * from {$this->wp_prefix}automatic_feeds_list where feed='$myfeed' and camp_id = '$camp->camp_id' limit 1"; * $feeds = $this->db->get_results ( $query ); * $feed_o = $feeds [0]; */ // report processed feed $this->log ( 'Process Feed', '<a href="' . $feed . '">' . $feed . '</a>' ); // If force feed if (in_array ( 'OPT_FEED_FORCE', $camp_opt )) { if (! function_exists ( 'wp_automatic_force_feed' )) { add_action ( 'wp_feed_options', 'wp_automatic_force_feed', 10, 1 ); function wp_automatic_force_feed($feed) { $feed->force_feed ( true ); } } } //disable order by date for multi-page scraper if(isset($camp->camp_sub_type) && $camp->camp_sub_type == 'Multi'){ add_action ( 'wp_feed_options', 'wp_automatic_order_feed', 10, 1 ); if(! function_exists('wp_automatic_order_feed')){ function wp_automatic_order_feed($feed) { $feed->enable_order_by_date(false); } } } // loading SimplePie include_once (ABSPATH . WPINC . '/feed.php'); // Add action to fix the problem of curl transfer closed without complete data // Wrong feed length fix if (! function_exists ( 'wp_automatic_setup_curl_options' )) { // feed timeout function wp_automatic_setup_curl_options($curl) { if (is_resource ( $curl )) { curl_setopt ( $curl, CURLOPT_HTTPHEADER, array ( 'Expect:' ) ); } } } if (in_array ( 'OPT_FEED_NOLENGTH', $camp_opt )) { echo ' expecting none'; add_action ( 'http_api_curl', 'wp_automatic_setup_curl_options' ); } if (! 
function_exists ( 'wp_automatic_wp_feed_options' )) { function wp_automatic_wp_feed_options($args) { $args->set_useragent ( 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/41.0.2272.76 ' ); } add_action ( 'wp_feed_options', 'wp_automatic_wp_feed_options' ); } // Trim returned feed content because some feed add empty spaces before feed content if (! function_exists ( 'wp_automatic_trim_feed_content' )) { function wp_automatic_trim_feed_content($args) { $args ['body'] = trim ( $args ['body'] ); // $args['body'] = preg_replace('{article/(\d+?)/}' , "article/$1wpp/", trim($args['body']) ) ; return $args; } } add_filter ( 'http_response', 'wp_automatic_trim_feed_content' ); if (stristr ( $feed, 'news.google' ) && ! function_exists ( 'wp_automatic_feed_options' )) { // Fix Google news image stripped echo '<br>Google news feed found, disabling sanitization...'; function wp_automatic_feed_options($feed) { $feed->set_sanitize_class ( 'SimplePie_Sanitize' ); $feed->sanitize = new SimplePie_Sanitize (); } add_action ( 'wp_feed_options', 'wp_automatic_feed_options' ); } // If proxified download the feed content to a test file for // exclude multipage scraper as it is already a local feed /wp-content/uploads/277d7822fe5662e66d660a42eaae4910_837.xml $localFeed = ''; if ($this->isProxified && ! (stristr ( $feed, '.xml' ) && stristr ( $feed, 'wp-content' ))) { // downloading the feed content // print_r( $this->download_file $downloadedFileUrl = $this->download_file ( $feed, '.html' ); if (trim ( $downloadedFileUrl ) != '') { echo '<br>Feed downloaded using a proxy ' . $downloadedFileUrl; $localFeed = $downloadedFileUrl . '?key=' . md5 ( trim ( $feed ) ); } } // fix ssl verification add_filter ( 'https_ssl_verify', '__return_false' ); // Fetch feed content if (trim ( $localFeed ) == '') { timer_start (); $rss = fetch_feed ( stripslashes ( $feed ) ); echo '<br>Time taken to load the feed from the source:' . 
timer_stop (); } else { echo '<br>Loaded locally'; $rss = fetch_feed ( $localFeed ); //log that the feed was loaded locally wp_automatic_log_new('Feed' , 'Feed loaded locally'); } // Remove added filter remove_filter ( 'http_response', 'wp_automatic_trim_feed_content' ); if (! is_wp_error ( $rss )) { // Checks that the object is created correctly if (trim ( $rss->raw_data ) == '') { echo '<br>Feed was loaded from cache'; } else { echo '<br>Feed was freshly loaded from the source'; } // Figure out how many total items there are, but limit it to 5. $maxitems = @$rss->get_item_quantity (); echo '<br>Feed contains ' . $maxitems . ' posts'; //log the feed load time wp_automatic_log_new('Feed' , 'Feed load time:'.timer_stop() . ' and contains ('. $maxitems . ') posts' ); // Build an array of all the items, starting with element 0 (first element). $rss_items = $rss->get_items ( 0, $maxitems ); // feed name $res ['feed_name'] = $rss->get_title (); // remove the expect again as it makes jetpack publicize to not work remove_action ( 'http_api_curl', 'wp_automatic_setup_curl_options' ); } else { $error_string = $rss->get_error_message (); //check if error is `403` and report that this page maybe protected by cloudflare and user should check the tutorial on how to import from this site if(stristr($error_string, '403')){ //error message inside a span with color red $error_string = '<br><span style="color:red">403 error: This page maybe protected by cloudflare, Check <a href="https://valvepress.com/how-to-import-from-rss-feeds-protected-with-cloudflare/" target="blank"> THIS TUTURIAL</a> on how to import from this site</span>'; echo $error_string; }else{ echo '<br><strong>Error:</strong>' . $error_string; } } if (! 
isset ( $maxitems ) || $maxitems == 0) return false; else // reverse order if exists if (in_array ( 'OPT_FEED_REVERSE', $camp_opt )) { echo '<br>Reversing order'; $rss_items = array_reverse ( $rss_items ); } // Loop through each feed item and display each item as a hyperlink. $i = 0; $i2 = 0; // refer to items number in feed foreach ( $rss_items as $item ) : $url = esc_url ( $item->get_permalink () ); if (trim ( $url ) == '') { echo '<br>Feed item does not contain a URL, what a bad feed, will use a fake URL'; $url = md5 ( $item->get_title () ); } $original_url = $url; // used for exclusion because the effective_url may change below // google news links correction if (stristr ( $url, 'news.google' ) && stristr ( $url, 'url=' )) { $urlParts = explode ( 'url=', $url ); $correctUrl = $urlParts [1]; $url = $correctUrl; }elseif(stristr($url,'news.google') && stristr($url,'/articles/') ){ echo '<br>Google news link found ('.$url.'), Finding original link now...<br>'; //api call to get the real link try{ $new_link = $this->get_google_news_link($url); $url = $new_link; echo ' Correct link: '.$url; }catch(Exception $e){ //report error with color red echo '<br><span style="color:red">Error: '.$e->getMessage().'</span>'; } } // Google alerts links correction if (stristr ( $feed, 'alerts/feeds' ) && stristr ( $feed, 'google' )) { preg_match ( '{url\=(.*?)[&]}', $url, $urlMatches ); $correctUrl = $urlMatches [1]; if (trim ( $correctUrl ) != '') { $url = $correctUrl; } } // check if no new links: the last first url is the same if ($i2 == 0) { if ($isItemsEndReached == 'yes') { // Last time there were no new links check if the case didn't change if ( $lastFirstFeedUrl == md5 ( $url ) ) { // delete transient cache delete_transient ( 'feed_' . $feedMd5 ); if( ! in_array('OPT_LINK_NOCACHE', $camp_opt)){ // still no new links stop checking now echo '<br>First url in the feed:' . $url; echo '<br>This link was the same as the last time we did not find new links so ... 
skipping till new posts get added <a href="#" class="wp_automatic_ajax" data-action="wp_automatic_ajax" data-function="forget_lastFirstFeedUrl" data-data="' . $feedMd5 . '" data-camp="' . $camp->camp_id . '" >Forget this fact Now</a>.'; return false; } } else { // new links found remove the isItemsEndReached flag delete_post_meta ( $camp->camp_id, $feedMd5 . '_isItemsEndReached' ); } } } // Record first feed url if ($i2 == 0) { update_post_meta ( $camp->camp_id, $feedMd5 . '_lastFirstFeedUrl', md5 ( $url ) ); } // one more link, increase index2 $i2 ++; if (trim ( $url ) == '') { echo '<br>item have no url skipping'; continue; } // current post url echo '<br>post URL: ' . $url; // post date $wpdate = $item->get_date ( "" ); if( trim($wpdate) != '' ) echo '<br>- Published: ' . $wpdate; if(trim($wpdate) != ''){ //IST fix timesofindia $wpdate = str_replace(' IST', ' +05:30', $wpdate); $wpdate = date ( "Y-m-d H:i:s", strtotime($wpdate)) ; echo ' == ' . $wpdate . ' GMT'; } $publish_date = get_date_from_gmt ( $wpdate ); // post categories $cats = ($item->get_categories ()); // separate categories with commas $cat_str = ''; if (isset ( $cats )) { foreach ( $cats as $cat ) { if (trim ( $cat_str ) != '') $cat_str .= ','; $cat_str .= $cat->term; } } // fix empty titles if (trim ( $item->get_title () ) == '') { echo '<--Empty title skipping'; continue; } // &# encoded chars if (stristr ( $url, '&#' )) { $url = html_entity_decode ( $url ); } // check if execluded link due to exact match does not exists if ($this->is_execluded ( $camp->camp_id, $url )) { echo '<-- Excluded link'; continue; } // check if older than minimum date if ( $wpdate != '' && $this->is_link_old ( $camp->camp_id, strtotime ( $wpdate ) )) { echo '<--old post execluding...'; continue; } // check media images unset ( $media_image_url ); $enclosures = $item->get_enclosures (); $i = 0; $enclosure_link = '' ;// reset enclosure link foreach ( $enclosures as $enclosure ) { if (trim ( $enclosure->type ) != '') { 
$enclosure_link = $enclosure->link; $res ['enclosure_link_' . $i] = $enclosure_link; if (isset ( $enclosure->type ) && stristr ( $enclosure->type, 'image' ) && isset ( $enclosure->link )) { $media_image_url = $enclosure->link; } } $i ++; } // Duplicate check if (! $this->is_duplicate ( $url )) { echo '<-- new link'; $title = strip_tags ( $item->get_title () ); // fix ' $title = str_replace ( '&apos;', "'", $title ); $title = str_replace ( ''', "'", $title ); // check if there is a post published with the same title if (in_array ( 'OPT_FEED_TITLE_SKIP', $camp_opt )) { if ($this->is_title_duplicate ( $title, $camp->camp_post_type )) { echo '<-- duplicate title skipping..'; continue; } } $i ++; // post content $html = $item->get_content (); // fix single quotes $html = $this->convert_single_quotes ( $html ); $postAuthor = $item->get_author (); @$authorName = $postAuthor->name; if (trim ( $authorName ) == '') { @$authorName = $postAuthor->email; } if (trim ( $authorName ) == '') { @$authorName = $postAuthor->link; } $res ['author'] = $authorName; @$res ['author_link'] = $postAuthor->link; // source domain $res ['source_domain'] = $parse_url = parse_url ( $url, PHP_URL_HOST ); // encoded URL $res ['source_url_encoded'] = urlencode ( $url ); // If empty content make content = title if (trim ( $html ) == '') { if (trim ( $title ) != '') $html = $title; } // loging the feeds $md5 = md5 ( $url ); // if not image escape it $res ['cont'] = $html; $res ['original_content'] = $html; $res ['title'] = $title; $res ['original_title'] = $title; $res ['matched_content'] = $html; $res ['source_link'] = $url; $res ['publish_date'] = $publish_date; // if feed variable does not contain getpocket add set the date if (! stristr ( $feed, 'getpocket' )) { $res ['wpdate'] = $wpdate; } $res ['cats'] = $cat_str; $res ['tags'] = ''; $res ['enclosure_link'] = isset ( $enclosure_link ) ? 
$enclosure_link : ''; // custom atributes $arr = array (); $arrValues = array_values ( $item->data ['child'] ); foreach ( $arrValues as $arrValue ) { if (is_array ( $arrValue )) { $arr = array_merge ( $arr, $arrValue ); } } $res ['attributes'] = $arr; $og_title_enabled = in_array ( 'OPT_FEED_OG_TTL', $camp_opt ) || (isset ( $camp_general ['cg_ml_ttl_method'] ) && trim ( $camp_general ['cg_ml_ttl_method'] ) != 'auto'); // original meta description, add the regex extraction rule and activate the specific part to custom field method if not enabled if(in_array('OPT_ORIGINAL_META_DESC', $camp_opt)){ //add specific part to custom field extraction if not enabled if( ! in_array( 'OPT_FEED_PTF' , $camp_opt) ){ $camp_opt[] = 'OPT_FEED_PTF'; if( ! isset( $camp_general ['cg_part_to_field'] ) ) $camp_general ['cg_part_to_field'] = ''; } $seo_field_name = class_exists('WPSEO_Options') ? '_yoast_wpseo_metadesc' : '_yoast_wpseo_metadesc' ; $camp_general ['cg_part_to_field'] .= "\n" . 'regex|<meta name="description" content="(.*?)"|' . 
$seo_field_name ; } // original post content if (in_array ( 'OPT_FULL_FEED', $camp_opt ) || in_array ( 'OPT_FEED_CUSTOM', $camp_opt ) || in_array ( 'OPT_FEED_CUSTOM_R', $camp_opt ) || in_array ( 'OPT_ORIGINAL_META', $camp_opt ) || in_array ( 'OPT_ORIGINAL_CATS', $camp_opt ) || in_array ( 'OPT_ORIGINAL_TAGS', $camp_opt ) || in_array ( 'OPT_ORIGINAL_AUTHOR', $camp_opt ) || in_array ( 'OPT_FEED_PTF', $camp_opt ) || in_array ( 'OPT_FEEDS_OG_IMG', $camp_opt ) || $og_title_enabled) { echo '<br>Loading original post content ...'; // get content $x = 'error'; curl_setopt ( $this->ch, CURLOPT_HTTPGET, 1 ); curl_setopt ( $this->ch, CURLOPT_URL, trim ( html_entity_decode ( $url ) ) ); // encoding if (in_array ( 'OPT_FEED_ENCODING', $camp_opt )) { echo '<br>Clearing encoding..'; curl_setopt ( $this->ch, CURLOPT_ENCODING, "" ); } // cookie $cg_sn_cookie = $camp_general ['cg_ml_cookie']; if (trim ( $cg_sn_cookie ) != '') { $headers [] = "Cookie: $cg_sn_cookie "; curl_setopt ( $this->ch, CURLOPT_HTTPHEADER, $headers ); } // Source page html timer_start (); if( in_array('OPT_FEED_APIFY' , $camp_opt) ){ echo '<br>Loading the content using APIFY.COM service....'; $wp_automatic_apify_key = get_option('wp_automatic_apify_key',''); require_once 'inc/class.apify.php'; $apify = new ValvePress_APIFY( $wp_automatic_apify_key , html_entity_decode ( $url ) , $this->ch ); try { // cg_apify_wait_for_single $cg_apify_wait_for_single = isset($camp_general['cg_apify_wait_for_single']) ? $camp_general['cg_apify_wait_for_single'] : 0; //report echo '<br>Waiting for '.$cg_apify_wait_for_single.' mill seconds after loading the content...'; $apify_content = $apify->apify($cg_apify_wait_for_single); $original_cont = $apify_content; } catch (Exception $e) { echo '<br>Error:' . $e->getMessage () . 
' ....loading the post content directly'; $original_cont = $this->curl_exec_follow ( $this->ch ); } }elseif(in_array('OPT_FEED_NORED' , $camp_opt)){ $original_cont = curl_exec ( $this->ch ); }else{ $original_cont = $this->curl_exec_follow ( $this->ch ); } $x = curl_error ( $this->ch ); echo ' <--' . strlen ( $original_cont ) . ' chars returned in ' . timer_stop () . ' seconds'; // copy of the original content for later use that is unmodified $original_cont_raw = $original_cont; // converting encoding if (in_array ( 'OPT_FEED_CONVERT_ENC', $camp_opt )) { echo '<br>Converting encoding from ' . $camp_general ['cg_feed_encoding'] . ' to utf-8'; $original_cont = iconv ( trim ( $camp_general ['cg_feed_encoding'] ) . '//IGNORE', "UTF-8//IGNORE", $original_cont ); } // fix single quote used instead of regular quote $original_cont = $this->convert_single_quotes ( $original_cont ); // remove pargrphs containing a specific word, if enabled, add OPT_STRIP_VISUAL to camp_opt and add words to cg_feed_visual_strip if(in_array('OPT_STRIP_BY_WORD', $camp_opt) && !empty($camp_general['cg_post_strip_by_words']) ){ // add OPT_STRIP_VISUAL to camp_opt $camp_opt[] = 'OPT_STRIP_VISUAL'; // if cg_feed_visual_strip is not an array, set it to an empty array if(!is_array($camp_general['cg_feed_visual_strip'])){ $camp_general['cg_feed_visual_strip'] = array(); } // split cg_post_strip_by_words by new line, filter empty valaues and add xpaths to cg_feed_visual_strip $cg_post_strip_by_words = array_filter( explode( "\n", $camp_general['cg_post_strip_by_words'] ) ); // xpath example //p[contains(., "payroll")] foreach($cg_post_strip_by_words as $cg_post_strip_by_word){ $camp_general['cg_feed_visual_strip'][] = '//p[contains(., "'.trim($cg_post_strip_by_word).'")]'; } } // strip parts using xpaths stored on $camp_general['cg_feed_visual_strip'] if (in_array ( 'OPT_STRIP_VISUAL', $camp_opt )) { //require class.dom.php require_once 'inc/class.dom.php'; $wpAutomaticDom = new wpAutomaticDom ( 
$original_cont ); $cg_feed_visual_strip = array_filter( $camp_general ['cg_feed_visual_strip'] ); //if is array and not empty if (is_array($cg_feed_visual_strip) && !empty($cg_feed_visual_strip)) { $newContent = $wpAutomaticDom->removeElementsByXpath($cg_feed_visual_strip); //if new content is not empty then replace the original content if (!empty($newContent)) { $original_cont = $newContent; } } } // fix lazy loading if (in_array ( 'OPT_FEED_LAZY', $camp_opt )) { $cg_feed_lazy = trim ( $camp_general ['cg_feed_lazy'] ); if ($cg_feed_lazy == '') $cg_feed_lazy = 'data-src'; preg_match_all ( '{<img .*?>}s', $original_cont, $imgsMatchs ); $imgsMatchs = $imgsMatchs [0]; foreach ( $imgsMatchs as $imgMatch ) { if (stristr ( $imgMatch, $cg_feed_lazy )) { $newImg = $imgMatch; $newImg = str_replace ( ' src=', ' bad-src=', $newImg ); $newImg = preg_replace ( '{ bad-src=[\'|"].*?["|\'] }s', ' ', $newImg ); $newImg = str_replace ( $cg_feed_lazy, 'src', $newImg ); $original_cont = str_replace ( $imgMatch, $newImg, $original_cont ); } } } else { $original_cont = $this->lazy_loading_auto_fix ( $original_cont ); } // fix images $original_cont = wp_automatic_fix_relative_paths ( $original_cont, $url ); } // end-content-extraction // date if not existing if ( (in_array ( 'OPT_ORIGINAL_TIME', $camp_opt ) || in_array( 'OPT_YT_DATE' , $camp_opt) ) && ! isset ( $res ['wpdate'] ) ) { $found_date = ''; // ini echo '<br>Finding original date... '; $cg_original_time_regex = isset( $camp_general ['cg_original_time_regex'] ) ? 
trim ( $camp_general ['cg_original_time_regex'] ) : ''; // if regex is empty use default if ($cg_original_time_regex == '') { $cg_original_time_regex = '20\d{2}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d*)?.\d{2}:\d{2}'; } preg_match("!{$cg_original_time_regex}!s", $original_cont, $date_matches ); //timestamp instead data-time="1649495312">A if (isset ( $date_matches [0] ) && trim ( $date_matches [0] ) != '') { // nice found $found_date = $date_matches [0]; echo ' match found:'. $found_date; }else{ //not found lets find timestamp preg_match('!"(1\d{9})"!' , $original_cont , $timestamp_matches ); if(isset($timestamp_matches[1]) && trim($timestamp_matches[1]) != ''){ echo ' Found possible timestamp:'.$timestamp_matches[1]; $possible_date = date ( "Y-m-d H:i:s", $timestamp_matches[1] ); if(preg_match('!20\d{2}-!' , $possible_date) ){ echo ' approving:' . $possible_date; $found_date = $possible_date; } }else{ //extract using format 2023-03-25 08:57:07 preg_match('!20\d{2}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}!' , $original_cont , $date_matches ); if(isset($date_matches[0]) && trim($date_matches[0]) != ''){ echo ' Found possible date:'.$date_matches[0]; $found_date = $date_matches[0]; } } } if ( $found_date != '') { echo '<-- Found:' . $found_date; $res ['wpdate'] = $wpdate = date ( "Y-m-d H:i:s", strtotime ( $found_date ) ); // gmt date $publish_date = get_date_from_gmt ( $res ['wpdate'] ); $res ['publish_date'] = $publish_date; // check if older than minimum date if ( $wpdate != '' && $this->is_link_old ( $camp->camp_id, strtotime ( $wpdate ) )) { echo '<--old post execluding...'; $this->link_execlude ( $camp->camp_id, $original_url ); continue; } } else { echo '<- can not find'; } } // custom config for web-stories support. 
if the post type is web-story and the opt_full_feed is enabled then we need to remove it from camp_opt and add OPT_FEED_CUSTOM if (in_array ( 'OPT_FULL_FEED', $camp_opt ) && $camp->camp_post_type == 'web-story' && strpos ( $original_cont, '<amp-story' ) !== false ) { echo '<br>Web story detected...Extracing whole html'; // remove OPT_FULL_FEED $camp_opt = array_diff ( $camp_opt, array ( 'OPT_FULL_FEED' ) ); // add OPT_FEED_CUSTOM $camp_opt [] = 'OPT_FEED_CUSTOM'; // add custom config to simulate visual selector method and picking whole html $camp_general ['cg_feed_extraction_method'] = 'css'; $camp_general ['cg_feed_custom_id'] = array('/html'); $cg_feed_css_size = array (); $cg_feed_css_wrap = array (); $cg_custom_selector = array (); foreach ( $camp_general ['cg_feed_visual'] as $singleVisual ) { $cg_feed_css_size [] = 'single'; $cg_feed_css_wrap [] = 'outer'; $cg_custom_selector [] = 'xpath'; } $camp_general ['cg_feed_css_size'] = $cg_feed_css_size; $camp_general ['cg_feed_css_wrap'] = $cg_feed_css_wrap; $camp_general ['cg_custom_selector'] = $cg_custom_selector; } // FULL CONTENT if (in_array ( 'OPT_FULL_FEED', $camp_opt )) { // reset fullContentSuccess flag to true $this->fullContentSuccess = true; // test url // $url ="http://news.jarm.com/view/75431"; // get scripts $postponedScripts = array (); preg_match_all ( '{<script.*?</script>}s', $original_cont, $scriptMatchs ); $scriptMatchs = $scriptMatchs [0]; foreach ( $scriptMatchs as $singleScript ) { if (stristr ( $singleScript, 'connect.facebook' )) { $postponedScripts [] = $singleScript; } $original_cont = str_replace ( $singleScript, '', $original_cont ); } $x = curl_error ( $this->ch ); $url = curl_getinfo ( $this->ch, CURLINFO_EFFECTIVE_URL ); // Redability instantiate require_once 'inc/wp_automatic_readability/wp_automatic_Readability.php'; $wp_automatic_Readability = new wp_automatic_Readability ( $original_cont, $url ); $wp_automatic_Readability->debug = false; $result = $wp_automatic_Readability->init 
(); if ($result) { // Redability title $title = $wp_automatic_Readability->getTitle ()->textContent; // Redability Content $content = $wp_automatic_Readability->getContent ()->innerHTML; // twitter embed fix if (stristr ( $content, 'twitter.com' ) && ! stristr ( $content, 'platform.twitter' )) { $content .= '<script async src="//platform.twitter.com/widgets.js" charset="utf-8"></script>'; } // Remove wp_automatic_Readability attributes $content = preg_replace ( '{ wp_automatic_Readability\=".*?"}s', '', $content ); // Fix iframe if exists preg_match_all ( '{<iframe[^<]*/>}s', $content, $ifrMatches ); $iframesFound = $ifrMatches [0]; foreach ( $iframesFound as $iframeFound ) { $correctIframe = str_replace ( '/>', '></iframe>', $iframeFound ); $content = str_replace ( $iframeFound, $correctIframe, $content ); } // add postponed scripts if (count ( $postponedScripts ) > 0) { $content .= implode ( '', $postponedScripts ); } // Cleaning redability for better memory unset ( $wp_automatic_Readability ); unset ( $result ); // Check existence of title words in the content $title_arr = explode ( ' ', $title ); $valid = ''; $nocompare = array ( 'is', 'Is', 'the', 'The', 'this', 'This', 'and', 'And', 'or', 'Or', 'in', 'In', 'if', 'IF', 'a', 'A', '|', '-' ); foreach ( $title_arr as $title_word ) { if (strlen ( $title_word ) > 3) { if (! in_array ( $title_word, $nocompare ) && preg_match ( '/\b' . preg_quote ( trim ( $title_word ), '/' ) . '\b/ui', $content )) { echo '<br>Title word ' . $title_word . ' exists on the content, approving.'; // echo $content; $valid = 'yeah'; break; } else { // echo '<br>Word '.$title_word .' does not exists'; } } } if (trim ( $valid ) != '') { $res ['cont'] = $content; $res ['matched_content'] = $content; $res ['og_img'] = ''; if (! 
in_array ( 'OPT_FEED_TITLE_NO', $camp_opt )) { $res ['title'] = $title; $res ['original_title'] = $title; } // let's find og:image may be the content we got has no image preg_match ( '{<meta[^<]*?property=["|\']og:image["|\'][^<]*?>}s', $html, $plain_og_matches ); if (isset ( $plain_og_matches [0] ) && @stristr ( $plain_og_matches [0], 'og:image' )) { preg_match ( '{content=["|\'](.*?)["|\']}s', $plain_og_matches [0], $matches ); $og_img = $matches [1]; if (trim ( $og_img ) != '') { $res ['og_img'] = $og_img; } } // If og:image } else { echo '<br>Can not make sure if the returned content is the full content, using excerpt instead.'; } } else { echo '<br>Looks like we couldn\'t find the full content. :( returning summary'; //set the fullContentSuccess flag to false, this flag is to set the post as pending if posting from a specific list of urls and auto detect method is enabled $this->fullContentSuccess = false; } // Class or ID extraction } elseif (in_array ( 'OPT_FEED_CUSTOM', $camp_opt )) { // Load dom require_once 'inc/class.dom.php'; $wpAutomaticDom = new wpAutomaticDom ( $original_cont ); $cg_custom_selector = $camp_general ['cg_custom_selector']; $cg_feed_custom_id = $camp_general ['cg_feed_custom_id']; $cg_feed_custom_id = array_filter ( $cg_feed_custom_id ); $cg_feed_css_size = $camp_general ['cg_feed_css_size']; $cg_feed_css_wrap = $camp_general ['cg_feed_css_wrap']; echo '<br>Extracting content from original post for '; // test url // $url = "http://news.jarm.com/view/75431"; $wholeFound = ''; if (1) { $i = 0; foreach ( $cg_feed_custom_id as $cg_selecotr_data ) { $cg_selector = $cg_custom_selector [$i]; $cg_feed_css_size_s = $cg_feed_css_size [$i]; $cg_feed_css_wrap_s = $cg_feed_css_wrap [$i]; echo '<br>' . $cg_selector . ' = "' . $cg_selecotr_data . 
'"'; if ($cg_feed_css_wrap_s == 'inner') { $inner = true; } else { $inner = false; } if ($cg_selector == 'xpath') { $ret = $wpAutomaticDom->getContentByXPath ( stripslashes ( $cg_selecotr_data ), $inner ); } elseif ($cg_selector == 'class') { $ret = $wpAutomaticDom->getContentByClass ( $cg_selecotr_data, $inner ); } elseif ($cg_selector == 'id') { $ret = $wpAutomaticDom->getContentByID ( $cg_selecotr_data, $inner ); } $extract = ''; foreach ( $ret as $itm ) { $extract .= $itm; if ($cg_feed_css_size_s == 'single') { break; } } $rule_num = $i + 1; $res ['rule_' . $rule_num] = $extract; $res ['rule_' . $rule_num . '_plain'] = strip_tags ( $extract ); if (trim ( $extract ) == '') { echo '<br>Nothing found to extract for this rule'; } else { echo ' <-- ' . strlen ( $extract ) . ' chars extracted'; $wholeFound = (trim ( $wholeFound ) == '') ? $extract : $wholeFound . '<br>' . $extract; } $i ++; } if (trim ( $wholeFound ) != '') { $res ['cont'] = $wholeFound; $res ['matched_content'] = $wholeFound; } } else { echo '<br>could not parse the content returning summary'; } // REGEX EXTRACT } elseif (in_array ( 'OPT_FEED_CUSTOM_R', $camp_opt )) { echo '<br>Extracting content using REGEX '; $cg_feed_custom_regex = $camp_general ['cg_feed_custom_regex']; $finalmatch = ''; //all content found using REGEX $x=0; foreach ( $cg_feed_custom_regex as $cg_feed_custom_regex_single ) { $cg_feed_custom_regex_single = html_entity_decode ( $cg_feed_custom_regex_single ); if (trim ( $cg_feed_custom_regex_single ) != '') { $finalmatch2 = ''; // all content found using this REGEX rule // we have a regex echo '<br>Regex :' . htmlspecialchars ( $cg_feed_custom_regex_single ); // extracting if (trim ( $original_cont ) != '') { preg_match_all ( '!' . $cg_feed_custom_regex_single . '!is', $original_cont, $matchregex ); $c = 0; if(! 
isset($matchregex)) $matchregex = array(); //fix count(null) in next line for($i = 1; $i < count ( $matchregex ); $i ++) { foreach ( $matchregex [$i] as $newmatch ) { if (trim ( $newmatch ) != '') { //good we have a new extraction //json part to fix? "(.*?)" if (stristr($cg_feed_custom_regex_single , '"(.*?)' )){ echo '<-- Fixing JSON Part'; $suggestedFixedContent = wp_automatic_fix_json_part($newmatch); //overwriting if(trim($suggestedFixedContent) != ''){ echo '<-- Fix success overwriting '; $newmatch = $suggestedFixedContent; } } // update the final match for all found extraction from all rules if (trim ( $finalmatch2 ) != '') { $finalmatch2 .= '<br>' . $newmatch; } else { $finalmatch2 .= $newmatch; } if (trim ( $finalmatch ) != '') { $finalmatch .= '<br>' . $newmatch; } else { $finalmatch .= $newmatch; } } $c++; //break the loop if a single match only is needed if(in_array('OPT_FEED_REGEX_SINGLE_' . $i , $camp_opt)){ echo '<-- Single match break'; break; } } } $rule_num = $x + 1; $res ['rule_' . $rule_num] = $finalmatch2; $res ['rule_' . $rule_num . '_plain'] = strip_tags ( $finalmatch2 ); echo '<-- ' . strlen ( $finalmatch2 ) . ' chars found total of ' . $c . ' matches'; } else { echo '<br>Can not load original content.'; } } // rule not empty $x++; } // foreach rule if (trim ( $finalmatch ) != '') { // overwirte echo '<br>' . strlen ( $finalmatch ) . 
' chars extracted using REGEX'; $res ['cont'] = $finalmatch; $res ['matched_content'] = $finalmatch; } else { echo '<br>Nothing extracted using REGEX using summary instead..'; } } //Issue fix no title but URL instead of the title when title is set to auto-detect if($camp_general ['cg_ml_ttl_method'] == 'auto' && stristr($title, 'http')){ echo '<br>Extracting a good title....'; $possible_title = $this->extract_title_auto($title,$original_cont); if(trim($possible_title) != ''){ echo '<-- Good title found'; $res['title'] = $title = $possible_title; }else{ echo '<-- No title found'; } } // redirect_url tag finding $redirect = ''; $redirect = curl_getinfo ( $this->ch, CURLINFO_EFFECTIVE_URL ); if (trim ( $redirect ) != '') { $res ['redirect_url'] = $redirect; } else { $res ['redirect_url'] = $res ['source_link']; } // Stripping content using id or class from $res[cont] if (in_array ( 'OPT_STRIP_CSS', $camp_opt )) { echo '<br>Stripping content using:- '; $cg_selector = $camp_general ['cg_custom_strip_selector']; $cg_selecotr_data = $camp_general ['cg_feed_custom_strip_id']; $cg_selecotr_data = array_filter ( $cg_selecotr_data ); // Load dom $final_doc = new DOMDocument (); // getting encoding preg_match_all ( '{charset=["|\']([^"]+?)["|\']}', $original_cont, $encMatches ); $possibleCharSet = $encMatches [1]; $possibleCharSet = isset ( $possibleCharSet [0] ) ? $possibleCharSet [0] : ''; if (trim ( $possibleCharSet ) == '') $possibleCharSet = 'UTF-8'; // overwrite to utf if already utf-8 if ($possibleCharSet != 'UTF-8' && function_exists ( 'mb_detect_encoding' ) && mb_detect_encoding ( $res ['cont'] ) == 'UTF-8') { echo '<br>Source encoding is ' . $possibleCharSet . ' but we still think it is utf-8 resetting...'; $possibleCharSet = 'UTF-8'; } $charSetMeta = '<meta http-equiv="content-type" content="text/html; charset=' . $possibleCharSet . '"/>'; $full_html = '<head>' . $charSetMeta . '</head><body>' . $res ['cont'] . 
'</body>'; @$final_doc->loadHTML ( $full_html ); $selector = new DOMXPath ( $final_doc ); $html_to_count = $final_doc->saveHTML ( $final_doc->documentElement ); $i = 0; $inner = false; foreach ( $cg_selecotr_data as $cg_selector_data_single ) { echo '<br> - ' . $cg_selector [$i] . ' = "' . $cg_selector_data_single . '" '; if (trim ( $cg_selector_data_single ) != '') { if ($cg_selector [$i] == 'class') { $query_final = '//*[contains(attribute::class, "' . trim ( $cg_selector_data_single ) . '")]'; } elseif ($cg_selector [$i] == 'id') { $query_final = "//*[@id='" . trim ( $cg_selector_data_single ) . "']"; } $countBefore = $this->chars_count ( $html_to_count ); foreach ( $selector->query ( $query_final ) as $e ) { $e->parentNode->removeChild ( $e ); } $html_to_count = $final_doc->saveHTML ( $final_doc->documentElement ); $countAfter = $this->chars_count ( $html_to_count ); echo '<-- ' . ($countBefore - $countAfter) . ' chars removed'; } $i ++; } $contentAfterReplacement = $final_doc->saveHTML ( $final_doc->documentElement ); $contentAfterReplacement = str_replace ( array ( '<html>', '</html>', '<body>', '</body>', $charSetMeta ), '', $contentAfterReplacement ); $contentAfterReplacement = preg_replace ( '{<head>.*?</head>}', '', $contentAfterReplacement ); $res ['cont'] = trim ( $contentAfterReplacement ); // overwirte $res ['matched_content'] = $res ['cont']; } // Stripping content using REGEX if (in_array ( 'OPT_STRIP_R', $camp_opt )) { $current_content = $res ['matched_content']; $current_title = $res ['title']; $cg_post_strip = html_entity_decode ( $camp_general ['cg_post_strip'] ); $cg_post_strip = explode ( "\n", $cg_post_strip ); $cg_post_strip = array_filter ( $cg_post_strip ); foreach ( $cg_post_strip as $strip_pattern ) { if (trim ( $strip_pattern ) != '') { // $strip_pattern ='<img[^>]+\\>'; echo '<br>Stripping using REGEX:' . htmlentities ( $strip_pattern ); $countBefore = $this->chars_count ( $current_content ); $current_content = preg_replace ( '{' . 
trim ( $strip_pattern ) . '}is', '', $current_content ); // replacing in rules $i = 1; while ( isset ( $res ["rule_$i"] ) ) { $res ["rule_$i"] = preg_replace ( '{' . trim ( $strip_pattern ) . '}is', '', $res ["rule_$i"] ); $i ++; } $countAfter = $this->chars_count ( $current_content ); echo ' <-- ' . ($countBefore - $countAfter) . ' chars removed'; $current_title = preg_replace ( '{' . trim ( $strip_pattern ) . '}is', '', $current_title ); } } if (trim ( $current_content ) != '') { $res ['matched_content'] = $current_content; $res ['cont'] = $current_content; } if (trim ( $current_title ) != '') { $res ['matched_title'] = $current_title; $res ['original_title'] = $current_title; $res ['title'] = $current_title; } } // end regex replace // if option active OPT_STRIP_IMGS then remove all images if (in_array ( 'OPT_STRIP_IMGS', $camp_opt )) { echo '<br>Stripping images...'; //count before removing $countBefore = $this->chars_count ( $res ['matched_content'] ); $res ['matched_content'] = preg_replace ( '{<img[^>]+\\>}', '', $res ['matched_content'] ); $res ['cont'] = preg_replace ( '{<img[^>]+\\>}', '', $res ['cont'] ); //count after removing $countAfter = $this->chars_count ( $res ['matched_content'] ); echo '<-- ' . ($countBefore - $countAfter) . ' chars removed'; // replacing in rules $i = 1; while ( isset ( $res ["rule_$i"] ) ) { $res ["rule_$i"] = preg_replace ( '{<img[^>]+\\>}', '', $res ["rule_$i"] ); $i ++; } } // strip tags if (in_array ( 'OPT_STRIP_T', $camp_opt )) { echo '<br>Stripping html tags...'; $cg_allowed_tags = trim ( $camp_general ['cg_allowed_tags'] ); if (! 
stristr ( $cg_allowed_tags, '<script' )) { $res ['matched_content'] = preg_replace ( '{<script.*?script>}s', '', $res ['matched_content'] ); $res ['cont'] = preg_replace ( '{<script.*?script>}s', '', $res ['cont'] ); $res ['matched_content'] = preg_replace ( '{<noscript.*?noscript>}s', '', $res ['matched_content'] ); $res ['cont'] = preg_replace ( '{<noscript.*?noscript>}s', '', $res ['cont'] ); } $res ['matched_content'] = strip_tags ( $res ['matched_content'], $cg_allowed_tags ); $res ['cont'] = strip_tags ( $res ['cont'], $cg_allowed_tags ); } // validate content size // MUST CONTENT if (in_array ( 'OPT_MUST_CONTENT', $camp_opt )) { if (trim ( $res ['cont'] ) == '') { echo '<--No content excluding'; $this->link_execlude ( $camp->camp_id, $original_url ); continue; } } // limit content if (in_array ( 'OPT_MIN_LENGTH', $camp_opt ) || in_array ( 'OPT_MAX_LENGTH', $camp_opt ) ) { $contentTextual = preg_replace ( '{<script.*?script>}s', '', $res ['cont'] ); $contentTextual = strip_tags ( $contentTextual ); $contentTextual = str_replace ( ' ', '', $contentTextual ); if (function_exists ( 'mb_strlen' )) { $contentLength = mb_strlen ( $contentTextual ); } else { $contentLength = strlen ( $contentTextual ); } unset ( $contentTextual ); echo '<br>Content length:' . $contentLength; if( in_array ( 'OPT_MIN_LENGTH', $camp_opt ) ){ if ($contentLength < $camp_general ['cg_min_length']) { echo '<--Shorter than the minimum('. $camp_general ['cg_min_length'] . ')... Excluding'; $this->link_execlude ( $camp->camp_id, $original_url ); continue; } else { echo '<--Valid Min length.. i.e > ('. $camp_general ['cg_min_length'] .') '; } } if( in_array ( 'OPT_MAX_LENGTH', $camp_opt ) ){ if ($contentLength > $camp_general ['cg_max_length']) { echo '<--Longer than the maximum('.$camp_general ['cg_max_length'].')... Excluding'; $this->link_execlude ( $camp->camp_id, $original_url ); continue; } else { echo '<--Valid Max length i.e < ('. 
$camp_general ['cg_max_length'] .')'; } } } // Entity decode if (in_array ( 'OPT_FEED_ENTITIES', $camp_opt )) { echo '<br>Decoding html entities'; // php 5.3 and lower convert to invalid charchters that broke everything $res ['original_title'] = str_replace ( ' ', ' ', $res ['original_title'] ); $res ['matched_content'] = str_replace ( ' ', ' ', $res ['matched_content'] ); $res ['original_title'] = html_entity_decode ( $res ['original_title'], ENT_QUOTES | ENT_HTML401 ); $res ['title'] = html_entity_decode ( $res ['title'], ENT_QUOTES | ENT_HTML401 ); $res ['matched_content'] = html_entity_decode ( $res ['matched_content'], ENT_QUOTES | ENT_HTML401 ); $res ['cont'] = $res ['matched_content']; } // end entity decode // Clean googleads and <script tag $res ['cont'] = preg_replace ( '{<ins.*?ins>}s', '', $res ['cont'] ); $res ['cont'] = preg_replace ( '{<ins.*?>}s', '', $res ['cont'] ); if (! in_array ( 'OPT_FEED_SCRIPT', $camp_opt )) $res ['cont'] = preg_replace ( '{<script.*?script>}s', '', $res ['cont'] ); $res ['cont'] = preg_replace ( '{\(adsbygoogle.*?\);}s', '', $res ['cont'] ); $res ['matched_content'] = $res ['cont']; // meta tags if (in_array ( 'OPT_ORIGINAL_META', $camp_opt )) { // extract all metas preg_match_all ( '{<meta.*?>}s', $original_cont, $metaMatches ); $allMeta = $metaMatches [0]; foreach ( $allMeta as $singleMeta ) { if (stristr ( $singleMeta, 'keywords' )) { if (preg_match ( '{name[\s]?=[\s]?["\']keywords["\']}', $singleMeta )) { preg_match_all ( '{content[\s]?=[\s]?[\'"](.*?)[\'"]}s', $singleMeta, $realTagsMatches ); $realTagsMatches = $realTagsMatches [1]; if (trim ( $realTagsMatches [0] ) != '') { echo '<br>Meta tags:' . 
$realTagsMatches [0]; $res ['tags'] = $realTagsMatches [0]; } } } } } // Extract cats from original source if (in_array ( 'OPT_ORIGINAL_CATS', $camp_opt ) && trim ( $camp_general ['cg_feed_custom_id_cat'] ) != '') { echo '<br>Extracting original post categories '; $cg_selector_cat = $camp_general ['cg_custom_selector_cat']; $cg_selecotr_data_cat = $camp_general ['cg_feed_custom_id_cat']; $inner = false; echo ' for ' . $cg_selector_cat . ' = ' . $cg_selecotr_data_cat; // dom class if (! isset ( $wpAutomaticDom )) { require_once 'inc/class.dom.php'; $wpAutomaticDom = new wpAutomaticDom ( $original_cont ); } if (1) { $extract = ''; if ($cg_selector_cat == 'class') { $extract = $wpAutomaticDom->getContentByClass ( $cg_selecotr_data_cat, $inner ); } elseif ($cg_selector_cat == 'id') { $extract = $wpAutomaticDom->getContentByID ( $cg_selecotr_data_cat, $inner ); } elseif ($cg_selector_cat == 'xpath') { $extract = $wpAutomaticDom->getContentByXPath ( stripslashes ( $cg_selecotr_data_cat ), $inner ); } if (is_array ( $extract )) { if (in_array ( 'OPT_SELECTOR_SINGLE_CAT', $camp_opt )) { $extract = $extract [0]; } else { $extract = implode ( ' ', $extract ); } } if (trim ( $extract ) == '') { echo '<br>Nothing found to extract for this category rule'; } else { echo '<br>Cat Rule extracted ' . strlen ( $extract ) . 
' charchters '; if (stristr ( $extract, '<a' )) { preg_match_all ( '{<a .*?>(.*?)</a}su', $extract, $cats_matches ); $cats_founds = $cats_matches [1]; $cat_founds = array_map ( 'strip_tags', $cats_founds ); if(in_array('OPT_ORIGINAL_CATS_REPLACE' , $camp_opt)){ $post_cat_replace = trim($camp_general['cg_cat_replace']); $post_cat_replace_arr = array_filter( explode("\n" , $post_cat_replace ) ); foreach($post_cat_replace_arr as $post_cat_replace_arr_single){ if(stristr( $post_cat_replace_arr_single , '|')){ $post_cat_replace_arr_single = trim($post_cat_replace_arr_single); $post_cat_replace_arr_single_arr = explode('|' , $post_cat_replace_arr_single); echo '<br>Replacing ' . $post_cat_replace_arr_single_arr[0] . ' with ' . $post_cat_replace_arr_single_arr[1] . ' in found categories...'; foreach($cat_founds as $key => $val){ if(stristr($val, ';')) $val = html_entity_decode($val); if(trim($val) == trim($post_cat_replace_arr_single_arr[0])){ $cat_founds[$key] = $post_cat_replace_arr_single_arr[1]; echo '<-- found'; } } } } } $cats_str = implode ( ',', $cat_founds ); echo ' found cats:' . $cats_str; $res ['cats'] = $cats_str; } } } } // Extract tags from original source if (in_array ( 'OPT_ORIGINAL_TAGS', $camp_opt ) && trim ( $camp_general ['cg_feed_custom_id_tag'] ) != '' && trim($original_cont) != '') { echo '<br>Extracting original post tags '; $cg_selector_tag = $camp_general ['cg_custom_selector_tag']; $cg_selecotr_data_tag = $camp_general ['cg_feed_custom_id_tag']; $inner = false; echo ' for ' . $cg_selector_tag . ' = ' . $cg_selecotr_data_tag; // dom class if (! 
isset ( $wpAutomaticDom )) { require_once 'inc/class.dom.php'; $wpAutomaticDom = new wpAutomaticDom ( $original_cont ); } if (1) { $extract = ''; if ($cg_selector_tag == 'class') { $extract = $wpAutomaticDom->getContentByClass ( $cg_selecotr_data_tag, $inner ); } elseif ($cg_selector_tag == 'id') { $extract = $wpAutomaticDom->getContentByID ( $cg_selecotr_data_tag, $inner ); } elseif ($cg_selector_tag == 'xpath') { $extract = $wpAutomaticDom->getContentByXPath ( stripslashes ( $cg_selecotr_data_tag ), $inner ); } if (is_array ( $extract )) { if (in_array ( 'OPT_SELECTOR_SINGLE_TAG', $camp_opt )) { $extract = $extract [0]; } else { $extract = implode ( ' ', $extract ); } } if (trim ( $extract ) == '') { echo '<br>Nothing found to extract for this tag rule'; } else { echo '<br>Tag Rule extracted ' . strlen ( $extract ) . ' charchters '; if (stristr ( $extract, '<a' )) { preg_match_all ( '{<a .*?>(.*?)</a}su', $extract, $tags_matches ); $tags_founds = array(); //init $tags_founds = $tags_matches [1]; $tags_founds = array_map ( 'strip_tags', $tags_founds ); $tags_founds = array_map('trim', $tags_founds); // if option OPT_ORIGINAL_TAGS_REPLACE is set replace the tags if (in_array('OPT_ORIGINAL_TAGS_REPLACE', $camp_opt)) { // replace tags if (trim($camp_general['cg_tag_replace']) != '') { $post_tag_replace_arr = explode("\n", $camp_general['cg_tag_replace']); //report count of available tags echo '<br> - ' . count($tags_founds) . ' tags found: ' . implode(', ', $tags_founds); foreach ($post_tag_replace_arr as $post_tag_replace) { $post_tag_replace = trim($post_tag_replace); if (trim($post_tag_replace) != '') { $post_tag_replace_arr = explode('|', $post_tag_replace); $tag_from = trim($post_tag_replace_arr[0]); $tag_to = trim($post_tag_replace_arr[1]); echo '<br> - Replacing tag ' . $tag_from . ' with ' . $tag_to; $tags_founds = str_replace($tag_from, $tag_to, $tags_founds); } } } } $tags_str = implode ( ',', $tags_founds ); echo '<br>Found tags:' . 
$tags_str; $res ['tags'] = $tags_str; } } } } elseif (in_array ( 'OPT_ORIGINAL_TAGS', $camp_opt )) { echo '<br>You must add a valid ID/Class to Extract tags, No tags will get extracted.'; } // extract tags // extract author from original source if (in_array ( 'OPT_ORIGINAL_AUTHOR', $camp_opt ) && trim ( $camp_general ['cg_feed_custom_id_author'] ) != '') { echo '<br>Extracting original post author '; $cg_selector_author = $camp_general ['cg_custom_selector_author']; $cg_selecotr_data_author = $camp_general ['cg_feed_custom_id_author']; $inner = false; echo ' for ' . $cg_selector_author . ' = ' . $cg_selecotr_data_author; // dom class if (! isset ( $wpAutomaticDom )) { require_once 'inc/class.dom.php'; $wpAutomaticDom = new wpAutomaticDom ( $original_cont ); } if (1) { $extract = ''; if ($cg_selector_author == 'class') { $extract = $wpAutomaticDom->getContentByClass ( $cg_selecotr_data_author, $inner ); } elseif ($cg_selector_author == 'id') { $extract = $wpAutomaticDom->getContentByID ( $cg_selecotr_data_author, $inner ); } elseif ($cg_selector_author == 'xpath') { $extract = $wpAutomaticDom->getContentByXPath ( stripslashes ( $cg_selecotr_data_author ), $inner ); } if (is_array ( $extract )) { if (in_array ( 'OPT_SELECTOR_SINGLE_AUTHOR', $camp_opt )) { $extract = $extract [0]; } else { $extract = implode ( ' ', $extract ); } } // Validate returned author if (trim ( $extract ) == '') { echo '<br>Nothing found to extract for this author rule'; } else { echo '<br>author Rule extracted ' . strlen ( $extract ) . ' charchters '; if (stristr ( $extract, '<a' )) { preg_match_all ( '{<a .*?>(.*?)</a}su', $extract, $author_matches ); $author_founds = $author_matches [1]; $author_str = strip_tags ( $author_founds [0] ); echo ' Found author:' . $author_str; $res ['author'] = $author_str; }else{ //no links $author_str = strip_tags ( $extract ); if(strlen($author_str) < 100){ echo ' Found author:' . 
$author_str; $res ['author'] = $author_str; } } } } } elseif (in_array ( 'OPT_ORIGINAL_AUTHOR', $camp_opt )) { if (trim ( $res ['author'] ) == '') { echo '<br>You did not set a specific config to Extract Author, The plugin will try to auto-detect... '; // try to auto detect author using extract_author_auto if(isset($original_cont_raw)){ $author_str = $this->extract_author_auto ( $original_cont_raw ); }else{ $author_str = $this->extract_author_auto ( $original_cont ); } if (trim ( $author_str ) != '') { echo '<br>Auto detected author:' . $author_str; $res ['author'] = $author_str; }else{ echo '<br>Auto detection failed'; } } } // extract author if (! in_array ( 'OPT_ENCLUSURE', $camp_opt )) { if (isset ( $media_image_url ) && ! stristr ( $res ['cont'], '<img' )) { echo '<br>enclosure image:' . $media_image_url; $res ['cont'] = '<img src="' . $media_image_url . '" /><br>' . $res ['cont']; } } // Part to custom field OPT_FEED_PTF // Extracted custom fields ini $customFieldsArr = array (); $ruleFields = array (); // fields names If (in_array ( 'OPT_FEED_PTF', $camp_opt )) { echo '<br>Specific Part to custom field extraction'; // Load rules $cg_part_to_field = trim ( html_entity_decode ( $camp_general ['cg_part_to_field'] ) ); $cg_part_to_field_parts = explode ( "\n", $cg_part_to_field ); // Process rules foreach ( $cg_part_to_field_parts as $cg_part_to_field_part ) { echo '<br>Rule:' . htmlentities ( $cg_part_to_field_part ); // Validate format | if (! 
stristr ( $cg_part_to_field_part, '|' )) { echo '<- Wrong format...'; continue; } // Parse rule $rule_parts = explode ( '|', $cg_part_to_field_part ); $rule_single = 0; //ini //case regex with | used as alternatives if(count($rule_parts) > 3 && $rule_parts [0] == 'regex' ){ $rule_method = trim ( $rule_parts [0] ); //first thing $rule_field = trim(end($rule_parts)); // case |1 for the single value instead of many if(trim($rule_field) == 1){ $rule_single = 1; array_pop($rule_parts); // remove the 1 from the array $rule_field = trim(end($rule_parts)); // field name is now the last child } $rule_value_parts = array(); for($i = 1; $i < count($rule_parts) -1 ; $i++){ $rule_value_parts[] = $rule_parts[$i]; } $rule_value = implode('|' , $rule_value_parts ); }else{ $rule_method = trim ( $rule_parts [0] ); $rule_value = trim ( $rule_parts [1] ); $rule_field = trim ( $rule_parts [2] ); //set single or all if(isset($rule_parts [3]) && trim($rule_parts [3]) == 1) $rule_single = 1; } // Validate rule if (trim ( $rule_method ) == '' || trim ( $rule_value ) == '' || trim ( $rule_field ) == '') { echo '<- Wrong format...'; continue; } $ruleFields [] = $rule_field; // Validate rule method: class,id,regex,xpath if ($rule_method != 'id' && $rule_method != 'class' && $rule_method != 'regex' && $rule_method != 'xpath') { echo '<- Wrong Method:' . $rule_method; continue; } // id,class,xPath if ($rule_method == 'id' || $rule_method == 'class' || $rule_method == 'xpath') { // Dom object $doc = new DOMDocument (); // Load Dom @$doc->loadHTML ( $original_cont ); // xPath object $xpath = new DOMXPath ( $doc ); // xPath query if ($rule_method != 'xpath') { $query = "//*[@" . $rule_method . "='" . $rule_value . "']"; echo '<-- query:'.$query; $xpathMatches = $xpath->query ( "//*[@" . $rule_method . "='" . $rule_value . "']" ); } else { $xpathMatches = $xpath->query ( "$rule_value" ); } echo '<-- ' . count($xpathMatches) . ' matches found'; // Single item ? 
if ($rule_single) { $xpathMatches = array ( $xpathMatches->item ( 0 ) ); } // Rule result ini $rule_result = ''; foreach ( $xpathMatches as $xpathMatch ) { if ($rule_field == 'tags' || $rule_field == 'categories' || stristr($rule_field, 'taxonomy_')) { $rule_result .= ',' . $xpathMatch->nodeValue; } else { $rule_result .= $xpathMatch->nodeValue; } } // Store field to be added if (trim ( $rule_result ) != '') { echo ' <--' . $this->chars_count ( $rule_result ) . ' chars extracted'; if ($rule_field == 'categories') { $res ['cats'] = $rule_result; } else { $customFieldsArr [] = array ( $rule_field, $rule_result ); } } else { echo '<-- nothing found'; } } else { // Regex extract $matchregex = array (); $finalmatch = ''; //echo ' the rule value ' . $rule_value; // Match preg_match_all ( '{' . trim ( $rule_value ) . '}is', $original_cont, $matchregex ); if(isset($matchregex[1])){ echo '<-- ' . count($matchregex[1]) . ' matches '; }else{ echo '<-- Added REGEX seems not to contain (.*?) the part to really parse, please add it'; } $matchregex_vals = isset($matchregex [1])? $matchregex [1] : array() ; if (isset ( $matchregex [2] )) $matchregex_vals = array_merge ( $matchregex_vals, $matchregex [2] ); // single match if ($rule_single && count ( $matchregex_vals ) > 1) { $matchregex_vals = array ( $matchregex_vals [0] ); } // Read matches foreach ( $matchregex_vals as $newmatch ) { //fix json part //json part to fix? "(.*?)" if (stristr($rule_value , '"(.*?)' )){ echo '<-- Fixing JSON Part'; $suggestedFixedContent = wp_automatic_fix_json_part($newmatch); //overwriting if(trim($suggestedFixedContent) != ''){ echo '<-- Fix success overwriting '; $newmatch = $suggestedFixedContent; } } if (trim ( $newmatch ) != '') { if (trim ( $finalmatch ) == '') { $finalmatch .= '' . $newmatch; } else { if ($rule_field == 'tags' || $rule_field == 'categories' || stristr($rule_field, 'taxonomy_') ) { $finalmatch .= ',' . 
$newmatch; } else { $finalmatch .= $newmatch; } } } } // Store field to be added if (trim ( $finalmatch ) != '') { echo ' <--' . $this->chars_count ( $finalmatch ) . ' chars extracted'; if ($rule_field == 'categories') { $res ['cats'] = $res ['categories_to_set'] = $finalmatch; } else { $customFieldsArr [] = array ( $rule_field, $finalmatch ); } } else { echo '<-- Nothing found'; } } } // foreach rule } // if part to field enabled $res ['custom_fields'] = $customFieldsArr; foreach ( $ruleFields as $field_name ) { $field_value = ''; // ini foreach ( $customFieldsArr as $fieldsArray ) { if ($fieldsArray [0] == $field_name) { $field_value = $fieldsArray [1]; } } $res [$field_name] = $field_value; } // og:image check // $url="kenh14.vn/kham-pha/5-hoi-chung-benh-ky-quac-nghe-thoi-cung-toat-mo-hoi-hot-20151219200950753.chn"; $currentOgImage = ''; // for og:image found check if (in_array ( 'OPT_FEEDS_OG_IMG', $camp_opt )) { // getting the og:image // let's find og:image // if no http $original_cont = str_replace ( 'content="//', 'content="http://', $original_cont ); echo '<br>Extracting og:image...'; // let's find og:image may be the content we got has no image preg_match ( '{<meta[^<]*?(?:property|name)=["|\']og:image["|\'][^<]*?>}s', $original_cont, $plain_og_matches ); if (isset ( $plain_og_matches [0] ) && stristr ( $plain_og_matches [0], 'og:image' )) { preg_match ( '{content=["|\'](.*?)["|\']}s', $plain_og_matches [0], $matches ); $og_img = $matches [1]; if (trim ( $og_img ) != '') { $og_img_short = preg_replace ( '{http://.*?/}', '', $og_img ); echo $og_img_short; if (trim ( $og_img_short ) == '') { $og_img_short = $og_img; } // get og_title preg_match_all ( '/<img .*>/', $original_cont, $all_images ); $all_images = $all_images [0]; $foundAlt = ''; // ini foreach ( $all_images as $single_image ) { if (stristr ( $single_image, $og_img_short )) { // extract alt text preg_match ( '/alt=["|\'](.*?)["|\']/', $single_image, $alt_matches ); $foundAlt = (isset ( 
$alt_matches [1] )) ? $alt_matches [1] : ''; } } $res ['og_img'] = $og_img; // no http format but //tec if (! stristr ( $og_img, 'http' ) && stristr ( $og_img, '//' )) { $res ['og_img'] = 'https:' . $og_img; } $res ['og_img'] = trim($res ['og_img'] ); $res ['og_alt'] = $foundAlt; $currentOgImage = $og_img; } }else{ // if a webstory is found, get the image from the poster-portrait-src attribute if(stristr($original_cont, '<amp-story') && stristr($original_cont, 'poster-portrait-src') ){ preg_match_all ( '/poster-portrait-src=["|\'](.*?)["|\']/', $original_cont, $webstory_matches ); $webstory_matches = $webstory_matches[1]; if(isset($webstory_matches[0])){ echo '<br>Webstory found, getting image from poster-portrait-src attribute: '.$webstory_matches[0].''; $res ['og_img'] = $webstory_matches[0]; $res ['og_alt'] = ''; $og_alt = ''; } } } } // fix FB embeds if (stristr ( $res ['cont'], 'fb-post' ) && ! stristr ( $res ['cont'], 'connect.facebook' )) { echo '<br>Possible Facebook embeds found adding embed scripts'; $res ['cont'] .= '<div id="fb-root"></div> <script async defer src="https://connect.facebook.net/en_US/sdk.js#xfbml=1&version=v3.2"></script>'; } // fix twitter embeds if (stristr ( $res ['cont'], 'twitter.com' ) && ! stristr ( $res ['cont'], 'platform.twitter' )) { echo '<br>Possible tweets found without twitter JS. adding JS'; $res ['cont'] .= '<script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>'; } // fix instagram.com embeds if (stristr ( $res ['cont'], 'instagram.com' ) && ! stristr ( $res ['cont'], 'platform.instagram' )) { echo '<br>Possible Instagram embeds found without JS. 
adding JS'; $res ['cont'] .= '<script async defer src="https://platform.instagram.com/en_US/embeds.js"></script>'; } // fix youtube no height embeds if (stristr ( $res ['cont'], 'youtube.com/embed' )) { preg_match_all ( '{<iframe[^>]*?youtube.com/embed/(.*?)["|\?].*?>(?:</iframe>)?}', $res ['cont'], $yt_matches ); $yt_matches_full = $yt_matches [0]; $yt_matches_ids = $yt_matches [1]; if (count ( $yt_matches_full ) > 0) { $i = 0; foreach ( $yt_matches_full as $embed ) { echo '<br>Youtube video embed format changed to WordPress for video :' . $yt_matches_ids [$i]; $res ['cont'] = str_replace ( $embed, '[embed]https://www.youtube.com/watch?v=' . $yt_matches_ids [$i] . '[/embed]', $res ['cont'] ); $i ++; } } } // check if image or not if (in_array ( 'OPT_MUST_IMAGE', $camp_opt ) && ! stristr ( $res ['cont'], '<img' ) && trim ( $currentOgImage ) == '') { echo '<br>Post contains no images skipping it ...'; // Excluding it $this->link_execlude ( $camp->camp_id, $original_url ); } else { // og image fix if (isset ( $res ['og_img'] ) && trim ( $res ['og_img'] ) != '') { $og_img = $res ['og_img']; // make sure it has the domain if (! stristr ( $og_img, 'http:' )) { if (stristr ( $og_img, '//' )) { $og_img = 'http:' . $og_img; } else { $host = parse_url ( $url, PHP_URL_HOST ); // no domain at all $og_img = '/' . $og_img; $og_img = str_replace ( '//', '/', $og_img ); $og_img = 'http://' . $host . 
$og_img; $res ['og_img'] = $og_img; } } } // og title or custom title extraction method if (in_array ( 'OPT_FEED_OG_TTL', $camp_opt ) || (isset ( $camp_general ['cg_ml_ttl_method'] ) && trim ( $camp_general ['cg_ml_ttl_method'] ) != 'auto')) { if (in_array ( 'OPT_FEED_OG_TTL', $camp_opt )) { // let's find og:title may be the content we got has no preg_match ( '{<meta[^<]*?property=["|\']og:title["|\'][^<]*?>}s', $original_cont, $plain_og_matches ); if (@stristr ( $plain_og_matches [0], 'og:title' )) { preg_match ( '{content[\s]*=[\s]*"(.*?)"}s', $plain_og_matches [0], $matches ); $og_ttl = $matches [1]; echo '<br>og:title:' . html_entity_decode ( htmlspecialchars_decode ( $og_ttl ) ); if (trim ( $og_ttl ) != '') { $og_ttl = htmlspecialchars_decode ( $og_ttl, ENT_QUOTES ); $res ['title'] = html_entity_decode ( $og_ttl ); } } // If og:title } else { // custom extraction method $cg_ml_ttl_method = $camp_general ['cg_ml_ttl_method']; require_once 'inc/class.dom.php'; $wpAutomaticDom = new wpAutomaticDom ( $original_cont ); $finalContent = ''; if ($cg_ml_ttl_method == 'visual') { $cg_ml_visual = $camp_general ['cg_ml_visual']; $path = isset ( $cg_ml_visual [0] ) ? $cg_ml_visual [0] : ''; if (trim ( $path ) == '') { echo '<br>No path found for pagination, please set the extraction rule for pagination if you want to make use of pagination.'; } foreach ( $cg_ml_visual as $xpath ) { echo '<br>Getting title for XPath:' . $path; try { $finalContent = $wpAutomaticDom->getContentByXPath ( $xpath, false ); if (is_array ( $finalContent )) $finalContent = implode ( '', $finalContent ); echo '<-- ' . strlen ( $finalContent ) . ' chars'; } catch ( Exception $e ) { echo '<br>Error:' . 
$e->getMessage (); } } } elseif ($cg_ml_ttl_method == 'css') { $cg_ml_css_type = $camp_general ['cg_ml_css_type']; $cg_ml_css = $camp_general ['cg_ml_css']; $cg_ml_css_wrap = $camp_general ['cg_ml_css_wrap']; $cg_ml_css_size = $camp_general ['cg_ml_css_size']; $finalContent = ''; $i = 0; foreach ( $cg_ml_css_type as $singleType ) { if ($cg_ml_css_wrap [$i] == 'inner') { $inner = true; } else { $inner = false; } echo '<br>Extracting content by ' . $cg_ml_css_type [$i] . ' : ' . $cg_ml_css [$i]; if ($cg_ml_css_type [$i] == 'class') { $content = $wpAutomaticDom->getContentByClass ( $cg_ml_css [$i], $inner ); } elseif ($cg_ml_css_type [$i] == 'id') { $content = $wpAutomaticDom->getContentByID ( $cg_ml_css [$i], $inner ); } elseif ($cg_ml_css_type [$i] == 'xpath') { $content = $wpAutomaticDom->getContentByXPath ( stripslashes ( $cg_ml_css [$i] ), $inner ); } if (is_array ( $content )) { if ($cg_ml_css_size [$i] == 'single') { $content = $content [0]; } else { $content = implode ( "\n", $content ); } $finalContent .= $content . "\n"; } echo '<-- ' . strlen ( $content ) . ' chars'; $i ++; } // foreach rule } elseif ($cg_ml_ttl_method == 'regex') { $cg_ml_regex = $camp_general ['cg_ml_regex']; $finalContent = ''; $i = 0; foreach ( $cg_ml_regex as $cg_ml_regex_s ) { echo '<br>Extracting content by REGEX : ' . htmlspecialchars ( html_entity_decode ( $cg_ml_regex_s ) ); $content = $wpAutomaticDom->getContentByRegex ( html_entity_decode ( $cg_ml_regex_s ) ); $content = implode ( $content ); echo '<-- ' . strlen ( $content ) . ' chars'; if (trim ( $content ) != '') { $finalContent .= $content . "\n"; } $i ++; } } } //set title from extraction if available if(isset($finalContent) && trim($finalContent) != ''){ if(! 
in_array('OPT_FEED_NO_DECODE', $camp_opt)){ $possibleTitle = ''; if (trim ( $finalContent ) != '') { $possibleTitle = strip_tags ( $finalContent ); } if (trim ( $possibleTitle ) != '') { $res ['original_title'] = html_entity_decode ( $possibleTitle, ENT_QUOTES | ENT_HTML401 ); $res ['title'] = html_entity_decode ( $possibleTitle, ENT_QUOTES | ENT_HTML401 ); } }else{ $res ['original_title'] = $res ['title'] = $finalContent; } } } return $res; } } else { // duplicated link echo ' <-- duplicate in post <a href="' . admin_url ( 'post.php?post=' . $this->duplicate_id . '&action=edit' ) . '">#' . $this->duplicate_id . '</a>'; } //top post only if(in_array('OPT_FEED_TOP' , $camp_opt)) break; endforeach ; echo '<br>End of feed items reached.'; if (isset ( $camp->camp_sub_type ) && $camp->camp_sub_type == 'Multi') { delete_post_meta ( $camp->camp_id, 'wp_automatic_cache' ); } else { // Set isItemsEndReached flag to yes update_post_meta ( $camp->camp_id, $feedMd5 . '_isItemsEndReached', 'yes' ); } } // end function

	/**
	 * Try to find a usable post title inside the fetched page HTML.
	 *
	 * Three sources are probed in order and the first non-empty capture wins:
	 *  1. the <h1> element, only when the page contains exactly one "<h1"
	 *  2. the <title> tag
	 *  3. an og:title meta written as: og:title" content="..."
	 * The winning value is echoed to the run log before it is returned.
	 *
	 * @param string $title incoming title; it is never used as a fallback
	 * @param string $cont the HTML to search
	 * @return string the captured title (raw, may contain markup/entities) or '' when nothing was found
	 */
	function extract_title_auto($title, $cont){

		// callers may hand over null when nothing was fetched
		$cont = ( string ) $cont;

		// <h1>
		if (substr_count ( $cont, '<h1' ) == 1) {

			// preg_match returns 1 only on a real match, so index 1 is safe to read after it
			if (preg_match ( '!<h1.*?>(.*?)</h1>!', $cont, $title_matchs ) && trim ( $title_matchs [1] ) != '') {
				$title = $title_matchs [1];
				echo 'H1:' . $title;
				return $title;
			}
		}

		// <title>
		if (stristr ( $cont, '<title>' )) {

			if (preg_match ( '!<title>(.*?)</title>!', $cont, $title_matchs ) && trim ( $title_matchs [1] ) != '') {
				$title = $title_matchs [1];
				echo 'title tag:' . $title;
				return $title;
			}
		}

		// "og:title" content="Stripe and Plaid suit up for battle – TechCrunch"
		if (stristr ( $cont, 'og:title' )) {

			if (preg_match ( '!og:title" content="(.*?)"!', $cont, $title_matchs ) && trim ( $title_matchs [1] ) != '') {
				$title = $title_matchs [1];
				echo 'og:title:' . $title;
				return $title;
			}
		}

		// nothing matched: return an explicit empty string so callers can trim() the result safely
		return '';
	}

	/**
	 * Function takes the original HTML and tries to find the author name.
	 *
	 * Strategies, tried in order (the first one that yields a capture returns):
	 *  1. <meta name="author" content="..."> (must be non-empty)
	 *  2. JSON-LD style "@type":"Person" ... "name":"..."
	 *  3. a JSON "author":"..." pair
	 *  4. <span class="author">...</span>
	 * The found value is echoed to the run log before it is returned.
	 *
	 * @param string $cont the HTML to search
	 * @return string $author the captured author or '' when no strategy matched
	 */
	function extract_author_auto($cont){

		$author = '';

		// callers may hand over null when nothing was fetched
		$cont = ( string ) $cont;

		// <meta name="author" content="John Doe">
		if (stristr ( $cont, '<meta name="author"' )) {

			// the needle above does not guarantee the content="" part follows, so check the match itself
			if (preg_match ( '!<meta name="author" content="(.*?)"!', $cont, $author_matchs ) && trim ( $author_matchs [1] ) != '') {
				$author = $author_matchs [1];
				echo 'author meta:' . $author;
				return $author;
			}
		}

		// "@type": "Person", "name": "Rizwan Choudhury"
		// "@type":"Person","name":"Holly Ellyatt"
		// "@type":"Person","@id":"https://valvepress.com/#/schema/person/2e36bfac540bde918e245be55c976600","name":"Atef"
		if (stristr ( $cont, '"Person"' ) && stristr ( $cont, '"@type"' )) {

			preg_match_all ( '!"@type"\s*:\s*"Person".*?"name"\s*:\s*"(.*?)"!s', $cont, $author_matchs );

			if (isset ( $author_matchs [1] [0] )) {
				$author = $author_matchs [1] [0];
				echo 'author schema:' . $author;
				return $author;
			}
		}

		// "author":"Duncan Riley"
		if (stristr ( $cont, '"author"' )) {

			preg_match_all ( '!"author"\s*:\s*"(.*?)"!', $cont, $author_matchs );

			if (isset ( $author_matchs [1] [0] )) {
				$author = $author_matchs [1] [0];
				echo 'author json:' . $author;
				return $author;
			}
		}

		// <span class="author">Duncan Riley</span>
		if (stristr ( $cont, '<span class="author">' )) {

			preg_match_all ( '!<span class="author">(.*?)</span>!', $cont, $author_matchs );

			if (isset ( $author_matchs [1] [0] )) {
				$author = $author_matchs [1] [0];
				echo 'author span:' . $author;
				return $author;
			}
		}

		// no strategy matched: honour the documented string return type
		return $author;
	}

	/**
	 * Function takes a Google News link and returns the original link.
	 *
	 * The table {prefix}automatic_links is used as a cache: a row whose link_url equals
	 * md5($link) holds the resolved link in its link_title column. On a cache miss the
	 * googleNewsLinkExtractor API call is made and a non-empty answer is stored for next time.
	 * When the API answers with an empty value the given link is returned unchanged.
	 *
	 * @param string $link the Google News link
	 * @return string the cached or freshly resolved link, or the given link when the API returned nothing
	 * @throws Exception when the API call throws; the original exception is attached as "previous"
	 */
	function get_google_news_link($link) {

		global $wpdb;

		$md5 = md5 ( $link );
		$table = $wpdb->prefix . 'automatic_links';

		// check if the link is already cached (value is bound through prepare instead of interpolated)
		$cache = $wpdb->get_row ( $wpdb->prepare ( "SELECT * FROM {$table} WHERE link_url = %s", $md5 ) );

		if (isset ( $cache->link_url )) {

			// found in cache
			echo ' <span style="color:blue">[cached]</span>';
			return $cache->link_title;
		}

		// not found in cache, ask the API
		try {
			$newUrl = $this->api_call ( 'googleNewsLinkExtractor', array ( 'googleNewsLink' => $link ) );
		} catch ( Exception $e ) {
			throw new Exception ( 'Error in google news link extractor: ' . $e->getMessage (), 0, $e );
		}

		if (trim ( $newUrl ) != '') {

			$link = $newUrl;
			echo ' <span style="color:green">[new]</span>';

			// cache the result
			$wpdb->insert ( $table, array (
					'link_url' => $md5,
					'link_title' => $link
			) );
		}

		return $link;
	}
}