James Bachini

scraping

  • Two php functions for scraping content and extracting links

    ////// Grab webpage ////// function webFetcher($url) { $crawl = curl_init(); curl_setopt ($crawl, CURLOPT_URL, $url); curl_setopt($crawl, CURLOPT_RETURNTRANSFER, 1); $resulting = $resulting.curl_exec($crawl); curl_close($crawl); return $result = $resulting; } //////////////////////////////// //// extract links //// function extract_links($text) { preg_match_all(‘/<\s*a[^<>]*?href=[\’”]?([^\s<>\’”]*)[\’”]?[^<>]*>(.*?)<\/a>/si’, $text, $match_array, PREG_SET_ORDER); $return = array() ; foreach ($match_array as $serp) { $full_anchor = $serp[0]; $href = $serp[1]; $anchortext =…