////// Grab webpage //////
/**
 * Download the body of a URL using cURL.
 *
 * Only CURLOPT_URL and CURLOPT_RETURNTRANSFER are set; every other cURL
 * option keeps its default value.
 *
 * @param string $url Address to request.
 * @return string The response body, or an empty string when the transfer fails.
 */
function webFetcher($url)
{
    $crawl = curl_init();
    if ($crawl === false) {
        // Handle could not be created; keep the "always a string" contract.
        return '';
    }

    curl_setopt($crawl, CURLOPT_URL, $url);
    // Have curl_exec() hand the body back instead of echoing it.
    curl_setopt($crawl, CURLOPT_RETURNTRANSFER, 1);

    $body = curl_exec($crawl);
    curl_close($crawl);

    // curl_exec() returns false on failure. The previous code turned that into
    // '' implicitly by concatenating onto an undefined variable; do it explicitly.
    return ($body === false) ? '' : $body;
}
////////////////////////////////
//// extract links ////
/**
 * Pull anchor tags out of an HTML fragment and keep the external ones.
 *
 * A link is kept when its href:
 *   - contains "http:" (case-insensitive),
 *   - does not contain "cache", "google.com", "youtube.com" or
 *     "wikipedia.org" (case-insensitive, literal substrings),
 *   - does not start with "/".
 *
 * NOTE(review): hrefs using "https:" do not contain "http:" and are therefore
 * dropped, exactly as before — confirm whether that is still intended.
 *
 * @param string $text HTML to scan.
 * @return array List of array($href, $anchortext) pairs in document order;
 *               empty when nothing matches or the regex engine fails.
 */
function extract_links($text) {
    // Capture group 1 is the href value (optionally quoted), group 2 the anchor body.
    $pattern = '/<\s*a[^<>]*?href=[\'"]?([^\s<>\'"]*)[\'"]?[^<>]*>(.*?)<\/a>/si';

    $match_array = array();
    $found = preg_match_all($pattern, $text, $match_array, PREG_SET_ORDER);
    if ($found === false || !is_array($match_array)) {
        // PCRE error (e.g. backtrack limit hit on very large input).
        return array();
    }

    // Literal substrings; using stripos() avoids "." acting as a regex wildcard.
    $blocked = array('cache', 'google.com', 'youtube.com', 'wikipedia.org');

    $return = array();
    foreach ($match_array as $serp) {
        $href = $serp[1];
        $anchortext = $serp[2];

        if (stripos($href, 'http:') === false) {
            continue;
        }
        // $href is non-empty here because it contains "http:".
        if ($href[0] == '/') {
            continue;
        }

        $is_blocked = false;
        foreach ($blocked as $needle) {
            if (stripos($href, $needle) !== false) {
                $is_blocked = true;
                break;
            }
        }
        if ($is_blocked) {
            continue;
        }

        $return[] = array($href, $anchortext);
    }

    return $return;
}
/////////////////////////