" . $_POST['action'] . ""; ini_set( 'memory_limit', '500M' ); $debug = $_POST['debugMode']; $numTopPeopleToGetRecosFor = $_POST['numTopPeopleToGetRecosFor']; $numTopTagsToGetRecosFor = $_POST['numTopTagsToGetRecosFor']; $minRatingForReco = $_POST['minRatingForReco']; $minScoreForReco = $_POST['minScoreForReco']; $inclTvSeries = 0 == strcmp($_POST['inclTvSeries'], "on"); $inclVideoGames = 0 == strcmp($_POST['inclVideoGames'], "on"); $addedMovies = $_POST['addedMovies']; # Note: these need to be global for the cmp function... $seenList = array(); $recoInfo = array(); $peopleInfo = array(); $tagInfo = array(); $genreInfo = array(); $numMoviesIncluded = 0; $numMoviesExcluded = 0; if ($debug) $pm_time = microtime_float(); processMovies($seenList, $peopleInfo, $tagInfo, $genreInfo, $recoInfo); if ($debug) print "Processing all movies took ". (microtime_float() - $pm_time) . "s
"; function processMovies(&$seenList, &$peopleInfo, &$tagInfo, &$genreInfo, &$recoInfo){ global $debug, $addedMovies, $numMoviesIncluded, $numMoviesExcluded; # Prepare cache directories if (!file_exists("./cache")) mkdir("./cache", 0777); $lines = split("\n", $addedMovies); foreach ($lines as $lineNum => $line) { if ($debug) $ll_time = microtime_float(); $line = trim($line); if (!empty($line) && (strpos($line, '#') !== 0) && !in_array($line, $seenList)) { if (strpos($line, '-') === 0) { $line = trim(substr($line, 1)); $seenList[] = $line; $numMoviesExcluded++; continue; } $numMoviesIncluded++; $seenList[] = $line; # Genre links look like # Comedy if ($debug) $gre_time = microtime_float(); $tmp = processPage($line, "|([\d\w-]*?)|s", "genre", $genreInfo, "main"); if ($debug) print "Processing genre for movie $line took ". (microtime_float() - $gre_time) . "s
"; if ($debug) $gre_time = microtime_float(); $genreInfo = array_merge($genreInfo, $tmp); if ($debug) print "Adding all " . count($tmp) . " genre to real storage took ". (microtime_float() - $gre_time) . "s
"; # People links look like # Jake Gyllenhaal if ($debug) $ppl_time = microtime_float(); # To avoid people's photos, only capture name HTML enclosed in # either or . # Because we want the actual name too, capture a bit more. //$tmp = processPage($line . "fullcredits", "|(.*?)|s", "name", $peopleInfo, "credits"); $tmp = processPage($line . "fullcredits", "||s", "name", $peopleInfo, "credits"); if ($debug) print "Processing people for movie $line took ". (microtime_float() - $ppl_time) . "s
"; if ($debug) $ppl_time = microtime_float(); $peopleInfo = array_merge($peopleInfo, $tmp); if ($debug) print "Adding all " . count($tmp) . " name to real storage took ". (microtime_float() - $ppl_time) . "s
"; # Tag links look like #
Grandma Death if ($debug) $tag_time = microtime_float(); # The tag itself is enough; don't care about the display version. //$tmp = processPage($line . "keywords", "|(.*?)|s", "tag", $tagInfo, "keywords"); $tmp = processPage($line . "keywords", "|"; if ($debug) $tag_time = microtime_float(); $tagInfo = array_merge($tagInfo, $tmp); if ($debug) print "Adding all " . count($tmp) . " keyword to real storage took ". (microtime_float() - $tag_time) . "s
"; } else { if ($debug) print "Skipping line \"$line\"
"; } if ($debug) print "Processing movie $line took ". (microtime_float() - $ll_time) . "s
"; } if ($debug) $ss_time = microtime_float(); $genreInfo = array_count_values($genreInfo); if ($debug) print "Array_count_values on genreInfo took ". (microtime_float() - $ss_time) . "s
"; if ($debug) $ss_time = microtime_float(); $peopleInfo = array_count_values($peopleInfo); if ($debug) print "Array_count_values on peopleInfo took ". (microtime_float() - $ss_time) . "s
"; if ($debug) $ss_time = microtime_float(); $tagInfo = array_count_values($tagInfo); if ($debug) print "Array_count_values on tagInfo took ". (microtime_float() - $ss_time) . "s
"; if ($debug) $ss_time = microtime_float(); uksort($genreInfo, "genre_cmp"); if ($debug) print "Sorting genreInfo took ". (microtime_float() - $ss_time) . "s
"; if ($debug) $ss_time = microtime_float(); uksort($peopleInfo, "person_cmp"); if ($debug) print "Sorting peopleInfo took ". (microtime_float() - $ss_time) . "s
"; if ($debug) $ss_time = microtime_float(); uksort($tagInfo, "tag_cmp"); if ($debug) print "Sorting tagInfo took ". (microtime_float() - $ss_time) . "s
"; if ($debug) $ss_time = microtime_float(); printStats($seenList, $peopleInfo, $tagInfo, $genreInfo); if ($debug) print "Printing stats took ". (microtime_float() - $ss_time) . "s
"; // TODO: for top 10 people and tags, get the top 10 movies. if ($debug) $rr_time = microtime_float(); addReco($seenList, $peopleInfo, $tagInfo, $genreInfo, $recoInfo); if ($debug) print "Adding recommendations took ". (microtime_float() - $rr_time) . "s
"; } function addReco(&$seenList, &$peopleInfo, &$tagInfo, &$genreInfo, &$recoInfo) { global $debug; global $minScoreForReco, $minRatingForReco, $numTopPeopleToGetRecosFor, $numTopTagsToGetRecosFor, $inclTvSeries, $inclVideoGames; // Process top tags... $count = $numTopTagsToGetRecosFor; if ($debug) print "Getting $count recos out of " . count($tagInfo) . " tags
"; foreach ($tagInfo as $key => $value) { if ($count == 0) { break; } $count--; if ($debug) print "Getting recos for tag $key: http://www.imdb.com/keyword/$key/
"; $content = getHtml("http://www.imdb.com/keyword/$key/", "tag"); // Processes \r\n's first so they aren't converted twice. $order = array("\r\n", "\n", "\r"); $content = str_replace($order, "", $content); $curRating = '"'; // Top movies are coded like: //
"The Twilight Zone" (1959) if ($debug) $time = microtime_float(); preg_match_all("/(.*?)<.*?>\((.*?)\)<.*?user_rating\">((")|(\d+\.\d+))/s", $content, $matches); if ($debug) print "Processing " . count($matches[0]) . " recos for tag $key took ". (microtime_float() - $time) . "s
"; if (0 == count($matches[0])) { print "ERROR: " . count($matches[0]) . " recos for tag $key
"; } for ($mm = 0; $mm < count($matches[0]); $mm++) { //if ($debug) print "Matched: " . print_r($matches, true) . "
"; $url = "http://www.imdb.com" . $matches[1][$mm]; $year = $matches[3][$mm]; $title = $matches[2][$mm] . " (" . $year . ")"; $num = $matches[5][$mm] ? $matches[5][$mm] : $matches[6][$mm]; if (!$inclTvSeries && (strpos($year, "TV series") > 0 || strpos($year, "TV mini-series") > 0)) { // Skip TV series or TV mini-series continue; } if (!$inclVideoGames && strpos($year, "video game") > 0) { // Skip video games continue; } if ($num != '"') { $curRating = $num; } if ($curRating < $minRatingForReco) { // Done with this tag. break; } if (!in_array($url, $seenList)) { if (array_key_exists($url, $recoInfo)) { $rec = $recoInfo[$url]; $rec['cnt']++; $rec['scr']+= $value; if (false === strpos($rec['tag'], ",".$key.",")) { $rec['tag'] .= $key . ","; } // For some reason we need to set this record again... $recoInfo[$url] = $rec; if ($debug) print "Increasing count for url $url, title $title, rating $curRating because of tag $key: " . print_r($rec, true) . "
"; } else { $rec = array(); $rec['nm'] = $title; $rec['rat'] = $curRating; $rec['cnt'] = 1; // # times recommended $rec['scr'] = $value; // tag frequency $rec['tag'] = "," . $key . ","; $recoInfo[$url] = $rec; if ($debug) print "Adding url $url, title $title, rating $curRating because of tag $key: " . print_r($rec, true) . "
"; } } } } print "\n

----- RECOMMENDATIONS (" . count($recoInfo) . " total) -----

\n"; if ($debug) $ss_time = microtime_float(); uksort($recoInfo, "reco_cmp"); if ($debug) print "Sorting recommendations took ". (microtime_float() - $ss_time) . "s
"; $max1 = 10; $limitMsg = "Note: only the first $max1 recommendations with count 1 were printed.\n"; if ($debug) $pp_time = microtime_float(); foreach ($recoInfo as $url => $rec) { if ($max1 > 0 && $rec['scr'] >= $minScoreForReco) { print "" . $rec['nm'] . ", rated " . $rec['rat'] . ", scored " . $rec['scr'] . ", recommended " . $rec['cnt'] . " times because of tags: " . $rec['tag'] . "
\n"; } else if ($rec['scr'] < $minScoreForReco) { // List is ordered by score, so we're done. break; } else { print $limitMsg; break; } if ($rec['cnt'] == 1) { $max1--; } } if ($debug) print "Printing recommendations took ". (microtime_float() - $pp_time) . "s
"; }

/**
 * Downloads a page through getHtml() and returns the distinct values captured
 * by the first capture group of $regex.
 *
 * @param string $url      Page to fetch.
 * @param string $regex    PCRE pattern; its first capture group is the value collected.
 * @param string $type     Label used only in the debug output.
 * @param array  $storage  Not referenced by this function; kept so existing callers keep working.
 * @param string $cacheDir Cache sub-directory name handed to getHtml().
 *
 * @return array Unique captured values (keys are preserved by array_unique()).
 *               Empty when nothing matched, when the pattern has no capture
 *               group, or when the pattern failed to run.
 */
function processPage($url, $regex, $type, &$storage, $cacheDir)
{
    global $debug;

    if ($debug) print "Getting url $url\n";
    // getHtml() may hand back a non-string on failure; the regex engine needs a string.
    $content = (string) getHtml($url, $cacheDir);

    if ($debug) $time = microtime_float();
    $status = preg_match_all($regex, $content, $matches);
    // preg_match_all() returns false on a PCRE error, and a pattern without a
    // capture group never populates index 1. Fall back to empty lists so the
    // callers' array_merge() always receives an array.
    $ok = ($status !== false) && is_array($matches);
    $fullMatches = ($ok && isset($matches[0])) ? $matches[0] : array();
    $captured = ($ok && isset($matches[1])) ? $matches[1] : array();
    if ($debug) print "Processing " . count($fullMatches) . " $type took ". (microtime_float() - $time) . "s\n";

    if ($debug) $time = microtime_float();
    $tmp = array_unique($captured);
    if ($debug) print "Adding all " . count($tmp) . " $type to tmp storage took ". (microtime_float() - $time) . "s\n";

    return $tmp;
}

/**
 * uksort() callback: orders two keys of the global $genreInfo by their stored
 * values, larger value first.
 */
function genre_cmp($a, $b)
{
    global $genreInfo;

    return cmp($genreInfo[$a], $genreInfo[$b]);
}

/**
 * uksort() callback: orders two keys of the global $peopleInfo by their stored
 * values, larger value first.
 */
function person_cmp($a, $b)
{
    global $peopleInfo;

    return cmp($peopleInfo[$a], $peopleInfo[$b]);
}

/**
 * uksort() callback: orders two keys of the global $tagInfo by their stored
 * values, larger value first.
 */
function tag_cmp($a, $b)
{
    global $tagInfo;

    return cmp($tagInfo[$a], $tagInfo[$b]);
}

/**
 * uksort() callback for the global $recoInfo: higher total score ('scr')
 * first; records with equal scores are ordered by rating ('rat'), higher first.
 */
function reco_cmp($a, $b)
{
    global $recoInfo;

    // Sort by total score, not by the number of times recommended ('cnt').
    $aa = $recoInfo[$a]['scr'];
    $bb = $recoInfo[$b]['scr'];
    if ($aa == $bb) {
        $aa = $recoInfo[$a]['rat'];
        $bb = $recoInfo[$b]['rat'];
    }

    return cmp($aa, $bb);
}

/**
 * Descending-order comparison.
 *
 * @return int 1 when $a is smaller than $b, -1 when $a is larger, 0 otherwise.
 */
function cmp($a, $b)
{
    if ($a < $b) {
        return 1;
    }
    if ($a > $b) {
        return -1;
    }

    return 0;
}

function printStats(&$seenList, &$peopleInfo, &$tagInfo, &$genreInfo) { global $numMoviesIncluded, $numMoviesExcluded; print "

----- FILMS (" . $numMoviesIncluded . " used for recommendations; " . $numMoviesExcluded . " excluded) -----

\n"; print "

\n"; print "

\n"; print "

\n"; }

# Instead of passing "special" flag, do something that works in all cases,
# like passing a regex.
/**
 * Prints the entries of $storage one per line. Once ten entries whose value is
 * 1 have been printed, the next entry prints a note about the limit instead
 * and the function returns.
 *
 * NOTE(review): $urlPrefix is never referenced and the $special branch prints
 * only the value; link markup appears to be missing from the strings in this
 * copy of the file -- confirm against the original.
 *
 * @param array  $storage   Map of key => count; iterated in its existing order.
 * @param string $urlPrefix Currently unused.
 * @param mixed  $special   When truthy, the key is left out of the printed line.
 */
function printInfo(&$storage, $urlPrefix, $special)
{
    $max1 = 10;
    $limitMsg = "Note: only the first $max1 records with count 1 were printed.\n";

    foreach ($storage as $key => $value) {
        if ($max1 <= 0) {
            print $limitMsg;
            return;
        }

        if ($special) {
            print ": $value\n";
        } else {
            print "$key: $value\n";
        }

        if ($value == 1) {
            $max1--;
        }
    }
}

/**
 * Builds the on-disk path of a cache entry.
 *
 * Path separators in $file are percent-encoded so that the whole name (a URL
 * that ultimately comes from user input) stays a single file name inside the
 * cache sub-directory.
 *
 * @param string $cacheDir Sub-directory below ./cache.
 * @param string $file     Entry name, typically the URL that was fetched.
 *
 * @return string Relative path of the cache file.
 */
function cacheFilePath($cacheDir, $file)
{
    $name = str_replace(array("/", "\\"), array("%2F", "%5C"), $file);

    return "./cache/$cacheDir" . "/" . $name;
}

/**
 * Tells whether a cache entry exists for $file in the given sub-directory.
 *
 * @return bool
 */
function existsInCache($cacheDir, $file)
{
    # TODO: add some time-based expiration of cache here
    return file_exists(cacheFilePath($cacheDir, $file));
}

/**
 * Writes $contents to the cache entry for $file, creating the cache
 * directory (and any missing parents) first.
 *
 * On failure to open or write the file a message is echoed and the script
 * terminates, as callers have always relied on.
 */
function addToCache($cacheDir, $file, $contents)
{
    $dir = "./cache/$cacheDir";
    $path = cacheFilePath($cacheDir, $file);

    // Recursive, so a missing ./cache parent no longer makes the write fail.
    if (!is_dir($dir)) {
        mkdir($dir, 0777, true);
    }

    if (!$handle = fopen($path, 'w')) {
        echo "Cannot open file ($path)";
        exit;
    }

    if (fwrite($handle, $contents) === FALSE) {
        fclose($handle);
        echo "Cannot write to file ($path)";
        exit;
    }

    fclose($handle);
}

/**
 * Reads a cache entry.
 *
 * @return string|false File contents, or false when the file cannot be read.
 */
function getFromCache($cacheDir, $file)
{
    return file_get_contents(cacheFilePath($cacheDir, $file));
}

/**
 * Returns the body of $url, served from the cache when an entry exists and
 * fetched with cURL (then cached) otherwise.
 *
 * A failed download is reported through error_log() and is NOT cached, so a
 * transient network error is retried on the next run instead of being served
 * as an empty page forever.
 *
 * @param string $url      Address to fetch; only http and https are allowed.
 * @param string $cacheDir Cache sub-directory for this kind of page.
 *
 * @return string|false Page body; "" when the download failed; false only
 *                      when an existing cache file could not be read.
 */
function getHtml($url, $cacheDir)
{
    global $debug;

    if ($debug) $gh_time = microtime_float();

    if (existsInCache($cacheDir, $url)) {
        if ($debug) print "CACHE: get from cache: $url\n";
        $result = getFromCache($cacheDir, $url);
    } else {
        // cURL is used instead of file_get_contents($url), which is not
        // advised for remote URLs due to security concerns.
        $result = false;
        $error = "curl_init() failed";

        $ch = curl_init();
        if ($ch !== false) {
            curl_setopt($ch, CURLOPT_URL, $url);
            curl_setopt($ch, CURLOPT_HEADER, 0);
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
            // The URL can originate from form input: refuse file://, gopher://, etc.
            if (defined('CURLOPT_PROTOCOLS')) {
                curl_setopt($ch, CURLOPT_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
            }
            $result = curl_exec($ch);
            $error = curl_error($ch);
            curl_close($ch);
        }

        if ($result === false) {
            error_log("getHtml: could not fetch $url: $error");
            $result = "";
        } else {
            if ($debug) print "CACHE: add to cache: $url\n";
            addToCache($cacheDir, $url, $result);
        }
    }

    if ($debug) print "Getting html for $url took ". (microtime_float() - $gh_time) . "s\n";

    return $result;
}

/**
 * Current Unix timestamp in seconds, with microsecond resolution.
 *
 * @return float
 */
function microtime_float()
{
    return microtime(true);
}
?>