" . $_POST['action'] . "";
ini_set( 'memory_limit', '500M' );

/**
 * Reads one scalar form field from $_POST.
 *
 * Missing fields and non-scalar values (e.g. a field submitted as "name[]")
 * yield $default instead of raising a notice or leaking an array into code
 * that expects a string.
 *
 * @param string $name    Form field name.
 * @param mixed  $default Value returned when the field is absent or not scalar.
 * @return mixed The submitted scalar, or $default.
 */
function postField($name, $default) {
    if (isset($_POST[$name]) && is_scalar($_POST[$name])) {
        return $_POST[$name];
    }
    return $default;
}

# Request options. Numeric options are cast once here so that later
# comparisons and counters work on numbers rather than raw form strings.
$debug = (bool) postField('debugMode', false);
$numTopPeopleToGetRecosFor = (int) postField('numTopPeopleToGetRecosFor', 0);
$numTopTagsToGetRecosFor = (int) postField('numTopTagsToGetRecosFor', 0);
$minRatingForReco = (float) postField('minRatingForReco', 0);
$minScoreForReco = (float) postField('minScoreForReco', 0);
# Checkboxes: only the exact value "on" enables the option. A strict string
# comparison is used because strcmp() returns NULL for an array argument on
# old PHP versions, and 0 == NULL is true.
$inclTvSeries = postField('inclTvSeries', '') === "on";
$inclVideoGames = postField('inclVideoGames', '') === "on";
$addedMovies = (string) postField('addedMovies', '');

# Note: these need to be global for the cmp function...
$seenList = array();
$recoInfo = array();
$peopleInfo = array();
$tagInfo = array();
$genreInfo = array();
$numMoviesIncluded = 0;
$numMoviesExcluded = 0;

if ($debug) {
    $pm_time = microtime_float();
}
processMovies($seenList, $peopleInfo, $tagInfo, $genreInfo, $recoInfo);
if ($debug) {
    print "Processing all movies took ". (microtime_float() - $pm_time) . "s
";
}
/**
 * Parses the submitted movie list, collects genre, people and keyword data for
 * every included movie, then prints statistics and recommendations.
 *
 * Each line of the global $addedMovies is trimmed and handled as follows:
 *  - empty lines, lines starting with '#' and lines already in $seenList are
 *    skipped;
 *  - a line starting with '-' is stored in $seenList (without the '-') and
 *    counted in $numMoviesExcluded; no pages are fetched for it;
 *  - any other line is stored in $seenList, counted in $numMoviesIncluded and
 *    handed to processPage() as a URL: once as-is, once with "fullcredits"
 *    appended and once with "keywords" appended.
 *
 * After the loop the three collected lists are turned into value => occurrence
 * maps with array_count_values(), sorted with uksort(), printed through
 * printStats() and finally passed to addReco().
 *
 * @param array $seenList   Lines already handled; appended to.
 * @param array $peopleInfo Collected people values; replaced by a count map.
 * @param array $tagInfo    Collected keyword values; replaced by a count map.
 * @param array $genreInfo  Collected genre values; replaced by a count map.
 * @param array $recoInfo   Not touched here; forwarded to addReco().
 */
function processMovies(&$seenList, &$peopleInfo, &$tagInfo, &$genreInfo, &$recoInfo){
global $debug, $addedMovies, $numMoviesIncluded, $numMoviesExcluded;
# Prepare cache directories
if (!file_exists("./cache")) mkdir("./cache", 0777);
# NOTE(review): split() was deprecated in PHP 5.3 and removed in PHP 7.0;
# explode() is the usual replacement for a literal separator.
$lines = split("\n", $addedMovies);
foreach ($lines as $lineNum => $line) {
if ($debug) $ll_time = microtime_float();
$line = trim($line);
# The duplicate check runs on the trimmed line before any leading '-' is removed.
if (!empty($line) && (strpos($line, '#') !== 0) && !in_array($line, $seenList)) {
if (strpos($line, '-') === 0) {
# Excluded movie: remember it so it is never recommended, then move on.
# The per-line debug timing at the bottom of the loop is skipped by this continue.
$line = trim(substr($line, 1));
$seenList[] = $line;
$numMoviesExcluded++;
continue;
}
$numMoviesIncluded++;
$seenList[] = $line;
# Genre links look like
# Comedy
if ($debug) $gre_time = microtime_float();
# NOTE(review): with '|' as the delimiter, the pattern below is only a lazy
# capture group with no surrounding markup; the link markup it was meant to
# match appears to be missing - confirm against the original source.
$tmp = processPage($line, "|([\d\w-]*?)|s", "genre", $genreInfo, "main");
if ($debug) print "Processing genre for movie $line took ". (microtime_float() - $gre_time) . "s
";
if ($debug) $gre_time = microtime_float();
# array_merge() appends this movie's values to the running list (numeric keys
# are renumbered), so a value shared by several movies occurs several times.
$genreInfo = array_merge($genreInfo, $tmp);
if ($debug) print "Adding all " . count($tmp) . " genre to real storage took ". (microtime_float() - $gre_time) . "s
";
# People links look like
#
# NOTE(review): the next line is not valid PHP; it looks like the remainder of
# the example that belonged to the comment above - confirm and restore it as a comment.
Jake Gyllenhaal |
if ($debug) $ppl_time = microtime_float();
# To avoid people's photos, only capture name HTML enclosed in
# either or | .
# Because we want the actual name too, capture a bit more.
//$tmp = processPage($line . "fullcredits", "| | (.*?) | |s", "name", $peopleInfo, "credits");
# NOTE(review): no capture group is visible in the pattern below, while
# processPage() collects capture group 1 - the pattern looks incomplete; confirm.
$tmp = processPage($line . "fullcredits", "| | |s", "name", $peopleInfo, "credits");
if ($debug) print "Processing people for movie $line took ". (microtime_float() - $ppl_time) . "s
";
if ($debug) $ppl_time = microtime_float();
$peopleInfo = array_merge($peopleInfo, $tmp);
if ($debug) print "Adding all " . count($tmp) . " name to real storage took ". (microtime_float() - $ppl_time) . "s
";
# Tag links look like
# Grandma Death
if ($debug) $tag_time = microtime_float();
# The tag itself is enough; don't care about the display version.
//$tmp = processPage($line . "keywords", "|(.*?)|s", "tag", $tagInfo, "keywords");
# NOTE(review): the call below is truncated (the rest of the pattern, the
# remaining arguments and the closing parenthesis are missing). The
# commented-out call above shows the trailing arguments that were once used - confirm.
$tmp = processPage($line . "keywords", "|";
if ($debug) $tag_time = microtime_float();
$tagInfo = array_merge($tagInfo, $tmp);
if ($debug) print "Adding all " . count($tmp) . " keyword to real storage took ". (microtime_float() - $tag_time) . "s
";
} else {
if ($debug) print "Skipping line \"$line\"
";
}
# Reached for processed and skipped lines alike (excluded lines leave via continue).
if ($debug) print "Processing movie $line took ". (microtime_float() - $ll_time) . "s
";
}
# Collapse each flat list into a map of value => number of occurrences.
if ($debug) $ss_time = microtime_float();
$genreInfo = array_count_values($genreInfo);
if ($debug) print "Array_count_values on genreInfo took ". (microtime_float() - $ss_time) . "s
";
if ($debug) $ss_time = microtime_float();
$peopleInfo = array_count_values($peopleInfo);
if ($debug) print "Array_count_values on peopleInfo took ". (microtime_float() - $ss_time) . "s
";
if ($debug) $ss_time = microtime_float();
$tagInfo = array_count_values($tagInfo);
if ($debug) print "Array_count_values on tagInfo took ". (microtime_float() - $ss_time) . "s
";
# Sort each map by count. The comparators receive keys and look the counts up
# in the global arrays of the same name.
# NOTE(review): this only works while the by-reference parameters are bound to
# those globals, as in the top-level call; sorting by value directly (arsort)
# would remove that dependency - confirm before changing.
if ($debug) $ss_time = microtime_float();
uksort($genreInfo, "genre_cmp");
if ($debug) print "Sorting genreInfo took ". (microtime_float() - $ss_time) . "s
";
if ($debug) $ss_time = microtime_float();
uksort($peopleInfo, "person_cmp");
if ($debug) print "Sorting peopleInfo took ". (microtime_float() - $ss_time) . "s
";
if ($debug) $ss_time = microtime_float();
uksort($tagInfo, "tag_cmp");
if ($debug) print "Sorting tagInfo took ". (microtime_float() - $ss_time) . "s
";
if ($debug) $ss_time = microtime_float();
printStats($seenList, $peopleInfo, $tagInfo, $genreInfo);
if ($debug) print "Printing stats took ". (microtime_float() - $ss_time) . "s
";
// TODO: for top 10 people and tags, get the top 10 movies.
if ($debug) $rr_time = microtime_float();
addReco($seenList, $peopleInfo, $tagInfo, $genreInfo, $recoInfo);
if ($debug) print "Adding recommendations took ". (microtime_float() - $rr_time) . "s
";
}
/**
 * Builds movie recommendations from the leading tags and prints them.
 *
 * For up to $numTopTagsToGetRecosFor entries of $tagInfo (in iteration order)
 * the tag's IMDb keyword page is fetched through getHtml(), title rows are
 * extracted with a regular expression, and rows whose URL is not in $seenList
 * are accumulated in $recoInfo, keyed by movie URL. Each record holds:
 *   nm  - title followed by the parenthesised year text
 *   rat - rating in effect when the record was first added
 *   cnt - number of times the movie was recommended
 *   scr - sum of the $tagInfo values of the tags that recommended it
 *   tag - comma-delimited list of those tags
 * The records are then sorted with reco_cmp and printed until the score drops
 * below $minScoreForReco or ten records with cnt 1 have been printed.
 *
 * $peopleInfo, $genreInfo and the global $numTopPeopleToGetRecosFor are not
 * used in this body.
 *
 * @param array $seenList   URLs that must not be recommended.
 * @param array $peopleInfo Unused here.
 * @param array $tagInfo    Map of tag => value; iterated in its current order.
 * @param array $genreInfo  Unused here.
 * @param array $recoInfo   Map of URL => record; filled and sorted in place.
 */
function addReco(&$seenList, &$peopleInfo, &$tagInfo, &$genreInfo, &$recoInfo) {
global $debug;
global $minScoreForReco, $minRatingForReco, $numTopPeopleToGetRecosFor, $numTopTagsToGetRecosFor, $inclTvSeries, $inclVideoGames;
// Process top tags...
// $count is the number of tags still to process; the loop stops when it reaches 0.
$count = $numTopTagsToGetRecosFor;
if ($debug) print "Getting $count recos out of " . count($tagInfo) . " tags
";
foreach ($tagInfo as $key => $value) {
if ($count == 0) {
break;
}
$count--;
if ($debug) print "Getting recos for tag $key: http://www.imdb.com/keyword/$key/
";
$content = getHtml("http://www.imdb.com/keyword/$key/", "tag");
// Processes \r\n's first so they aren't converted twice.
$order = array("\r\n", "\n", "\r");
$content = str_replace($order, "", $content);
// Rating carried from row to row; it starts out as the placeholder character.
$curRating = '"';
// Top movies are coded like:
// "The Twilight Zone" (1959) |
// NOTE(review): the pattern in the preg_match_all call below looks damaged. Its
// unescaped double quote ends the PHP string early, and the code further down
// reads capture groups up to index 6 although fewer groups are visible.
// Confirm against the original source before relying on it.
if ($debug) $time = microtime_float();
preg_match_all("/(.*?)<.*?>\((.*?)\)<.*?user_rating\">((")|(\d+\.\d+))/s", $content, $matches);
if ($debug) print "Processing " . count($matches[0]) . " recos for tag $key took ". (microtime_float() - $time) . "s
";
if (0 == count($matches[0])) {
print "ERROR: " . count($matches[0]) . " recos for tag $key
";
}
for ($mm = 0; $mm < count($matches[0]); $mm++) {
//if ($debug) print "Matched: " . print_r($matches, true) . "
";
$url = "http://www.imdb.com" . $matches[1][$mm];
// Group 3 is treated as the year text, group 2 as the title, and the rating
// is taken from group 5 when that is truthy, otherwise from group 6.
$year = $matches[3][$mm];
$title = $matches[2][$mm] . " (" . $year . ")";
$num = $matches[5][$mm] ? $matches[5][$mm] : $matches[6][$mm];
// strpos(...) > 0 holds only for a match at offset 1 or later; a miss (false)
// and a match at offset 0 both fail the test.
if (!$inclTvSeries && (strpos($year, "TV series") > 0 || strpos($year, "TV mini-series") > 0)) {
// Skip TV series or TV mini-series
continue;
}
if (!$inclVideoGames && strpos($year, "video game") > 0) {
// Skip video games
continue;
}
// A row whose rating field is the placeholder keeps the rating already in effect.
if ($num != '"') {
$curRating = $num;
}
// Presumably the page lists rows by descending rating, so the first row below
// the minimum ends this tag - TODO confirm.
if ($curRating < $minRatingForReco) {
// Done with this tag.
break;
}
if (!in_array($url, $seenList)) {
if (array_key_exists($url, $recoInfo)) {
// Arrays are copied on assignment, so $rec is a copy that is modified and then
// written back below.
$rec = $recoInfo[$url];
$rec['cnt']++;
$rec['scr']+= $value;
// The tag list is stored with a leading and trailing comma so that whole tags
// can be found with a simple substring search.
if (false === strpos($rec['tag'], ",".$key.",")) {
$rec['tag'] .= $key . ",";
}
// For some reason we need to set this record again...
$recoInfo[$url] = $rec;
if ($debug) print "Increasing count for url $url, title $title, rating $curRating because of tag $key: " . print_r($rec, true) . "
";
} else {
$rec = array();
$rec['nm'] = $title;
$rec['rat'] = $curRating;
$rec['cnt'] = 1; // # times recommended
$rec['scr'] = $value; // tag frequency
$rec['tag'] = "," . $key . ",";
$recoInfo[$url] = $rec;
if ($debug) print "Adding url $url, title $title, rating $curRating because of tag $key: " . print_r($rec, true) . "
";
}
}
}
}
print "\n----- RECOMMENDATIONS (" . count($recoInfo) . " total) -----
\n";
// reco_cmp receives keys (URLs) and looks the records up in the global $recoInfo.
// NOTE(review): this relies on the by-reference parameter being bound to that
// global, as in the call chain from the top-level script - confirm before reuse.
if ($debug) $ss_time = microtime_float();
uksort($recoInfo, "reco_cmp");
if ($debug) print "Sorting recommendations took ". (microtime_float() - $ss_time) . "s
";
// $max1 is the number of single-recommendation (cnt 1) records still allowed.
// The message is built now, while $max1 still holds the full limit.
$max1 = 10;
$limitMsg = "Note: only the first $max1 recommendations with count 1 were printed.\n";
if ($debug) $pp_time = microtime_float();
foreach ($recoInfo as $url => $rec) {
if ($max1 > 0 && $rec['scr'] >= $minScoreForReco) {
print "" . $rec['nm'] . ", rated " . $rec['rat'] . ", scored " . $rec['scr'] . ", recommended " . $rec['cnt'] . " times because of tags: " . $rec['tag'] . "
\n";
} else if ($rec['scr'] < $minScoreForReco) {
// List is ordered by score, so we're done.
break;
} else {
// The score is still high enough, but the cnt 1 allowance is used up.
print $limitMsg;
break;
}
if ($rec['cnt'] == 1) {
$max1--;
}
}
if ($debug) print "Printing recommendations took ". (microtime_float() - $pp_time) . "s
";
}
/**
 * Fetches a page and returns the distinct values captured by group 1 of a regex.
 *
 * @param string $url      Page to fetch through getHtml().
 * @param string $regex    PCRE pattern; only capture group 1 is collected.
 * @param string $type     Label used in debug output only.
 * @param array  $storage  Not read or written here; kept so existing callers keep working.
 * @param string $cacheDir Cache sub-directory passed on to getHtml().
 * @return array Distinct group-1 captures (original match indexes are kept as
 *               keys). Always an array: empty when nothing matched, when the
 *               pattern has no capture group, or when matching failed.
 */
function processPage($url, $regex, $type, &$storage, $cacheDir) {
    global $debug;
    if ($debug) print "Getting url $url
";
    $content = getHtml($url, $cacheDir);

    if ($debug) $time = microtime_float();
    $matches = array();
    // getHtml() may hand back false after a failed fetch; match against an empty string then.
    preg_match_all($regex, (string) $content, $matches);
    // Guard both slots: after a PCRE error, or with a pattern that has no
    // capture group, $matches[1] does not exist. Passing that on to
    // array_unique() would return a non-array and make the caller's
    // array_merge() discard everything collected so far.
    $fullMatches = isset($matches[0]) ? $matches[0] : array();
    $captures = isset($matches[1]) ? $matches[1] : array();
    if ($debug) print "Processing " . count($fullMatches) . " $type took ". (microtime_float() - $time) . "s
";

    if ($debug) $time = microtime_float();
    $tmp = array_unique($captures);
    if ($debug) print "Adding all " . count($tmp) . " $type to tmp storage took ". (microtime_float() - $time) . "s
";
    return $tmp;
}
/**
 * Key comparator for uksort(): ranks two genre keys by the values stored for
 * them in the global $genreInfo, delegating the ordering to cmp().
 *
 * @param mixed $a First key.
 * @param mixed $b Second key.
 * @return int Result of cmp() on the two looked-up values.
 */
function genre_cmp($a, $b) {
    global $genreInfo;
    return cmp($genreInfo[$a], $genreInfo[$b]);
}
/**
 * Key comparator for uksort(): ranks two person keys by the values stored for
 * them in the global $peopleInfo, delegating the ordering to cmp().
 *
 * @param mixed $a First key.
 * @param mixed $b Second key.
 * @return int Result of cmp() on the two looked-up values.
 */
function person_cmp($a, $b) {
    global $peopleInfo;
    return cmp($peopleInfo[$a], $peopleInfo[$b]);
}
/**
 * Key comparator for uksort(): ranks two tag keys by the values stored for
 * them in the global $tagInfo, delegating the ordering to cmp().
 *
 * @param mixed $a First key.
 * @param mixed $b Second key.
 * @return int Result of cmp() on the two looked-up values.
 */
function tag_cmp($a, $b) {
    global $tagInfo;
    return cmp($tagInfo[$a], $tagInfo[$b]);
}
/**
 * Key comparator for uksort() on the global $recoInfo.
 *
 * Records are ranked by their total score ('scr'), not by the number of times
 * they were recommended; records with equal scores are ranked by rating
 * ('rat'). The ordering itself is delegated to cmp().
 *
 * @param mixed $a First key (a movie URL in $recoInfo).
 * @param mixed $b Second key (a movie URL in $recoInfo).
 * @return int Result of cmp() on the deciding pair of values.
 */
function reco_cmp($a, $b) {
    global $recoInfo;
    $first = $recoInfo[$a];
    $second = $recoInfo[$b];
    if ($first['scr'] == $second['scr']) {
        // Same score: fall back to the rating as tie-breaker.
        return cmp($first['rat'], $second['rat']);
    }
    return cmp($first['scr'], $second['scr']);
}
/**
 * Descending-order comparison: the larger value sorts first.
 *
 * @param mixed $a Left value.
 * @param mixed $b Right value.
 * @return int 1 when $a is smaller than $b, -1 when it is larger, 0 otherwise.
 */
function cmp($a, $b) {
    return ($a < $b) ? 1 : (($a > $b) ? -1 : 0);
}
/**
 * Prints the films header line followed by three empty lines.
 *
 * The included and excluded counts come from the globals $numMoviesIncluded
 * and $numMoviesExcluded. None of the by-reference parameters is read here.
 *
 * @param array $seenList   Not used in this body.
 * @param array $peopleInfo Not used in this body.
 * @param array $tagInfo    Not used in this body.
 * @param array $genreInfo  Not used in this body.
 */
function printStats(&$seenList, &$peopleInfo, &$tagInfo, &$genreInfo) {
    global $numMoviesIncluded, $numMoviesExcluded;
    $header = "----- FILMS (" . $numMoviesIncluded . " used for recommendations; ";
    $header .= $numMoviesExcluded . " excluded) -----
\n";
    print $header;
    for ($blank = 0; $blank < 3; $blank++) {
        print "\n";
    }
}
# TODO: replace the "special" flag with something that works in all cases,
# such as passing a regex.
/**
 * Prints the entries of a count map, one per line.
 *
 * Printing stops with a notice once ten entries whose value is 1 have been
 * printed and a further entry is reached.
 *
 * @param array  $storage   Map of key => count, printed in its current order.
 * @param string $urlPrefix Not used in this body.
 * @param mixed  $special   When truthy the key is left out of each printed line.
 */
function printInfo(&$storage, $urlPrefix, $special) {
    $max1 = 10;
    $limitMsg = "Note: only the first $max1 records with count 1 were printed.\n";
    // Number of count-1 entries that may still be printed.
    $remaining = $max1;
    foreach ($storage as $key => $value) {
        if ($remaining <= 0) {
            print $limitMsg;
            return;
        }
        $row = $special ? ": $value\n" : "$key: $value\n";
        print $row;
        if ($value == 1) {
            $remaining--;
        }
    }
}
/**
 * Tells whether a cache entry exists on disk.
 *
 * The entry lives at ./cache/<cacheDir>/<file>, where every "/" in $file is
 * written as "%2F" so that the whole name is a single path component.
 *
 * @param string $cacheDir Cache sub-directory name.
 * @param string $file     Entry name (callers in this file pass a URL).
 * @return bool True when the entry's path exists.
 */
function existsInCache($cacheDir, $file) {
    # TODO: add some time-based expiration of cache here
    $entryPath = "./cache/$cacheDir" . "/" . str_replace("/", "%2F", $file);
    return file_exists($entryPath);
}
/**
 * Writes a cache entry to ./cache/<cacheDir>/<file>.
 *
 * Every "/" in $file is written as "%2F" so that the whole name is a single
 * path component. The directory is created recursively, so the entry can be
 * written even when ./cache itself does not exist yet.
 *
 * On failure an error message is echoed and the script exits.
 *
 * @param string $cacheDir Cache sub-directory name.
 * @param string $file     Entry name (callers in this file pass a URL).
 * @param string $contents Data to store.
 */
function addToCache($cacheDir, $file, $contents) {
    $cacheDir = "./cache/$cacheDir";
    $file = str_replace("/", "%2F", $file);

    // The second is_dir() check covers another request creating the directory
    // between our test and our mkdir() call.
    if (!is_dir($cacheDir) && !mkdir($cacheDir, 0777, true) && !is_dir($cacheDir)) {
        echo "Cannot create directory ($cacheDir)";
        exit;
    }

    $handle = fopen($cacheDir . "/" . $file, 'w');
    if (!$handle) {
        echo "Cannot open file ($cacheDir/$file)";
        exit;
    }
    if (fwrite($handle, $contents) === FALSE) {
        // Release the handle before bailing out.
        fclose($handle);
        echo "Cannot write to file ($cacheDir/$file)";
        exit;
    }
    fclose($handle);
}
/**
 * Reads a cache entry from ./cache/<cacheDir>/<file>.
 *
 * Every "/" in $file is written as "%2F", matching the name used when the
 * entry was stored.
 *
 * @param string $cacheDir Cache sub-directory name.
 * @param string $file     Entry name (callers in this file pass a URL).
 * @return string|false Entry contents, or whatever file_get_contents() returns on failure.
 */
function getFromCache($cacheDir, $file) {
    $encodedName = str_replace("/", "%2F", $file);
    return file_get_contents("./cache/$cacheDir" . "/" . $encodedName);
}
/**
 * Returns the body of a URL, using the on-disk cache when possible.
 *
 * A cached copy is returned when one exists. Otherwise the URL is fetched
 * with cURL and a successful response is stored in the cache before being
 * returned.
 *
 * @param string $url      Address to fetch; also used as the cache entry name.
 * @param string $cacheDir Cache sub-directory name.
 * @return string|false Page body, or false when the fetch failed.
 */
function getHtml($url, $cacheDir) {
    global $debug;
    if ($debug) $gh_time = microtime_float();

    if (existsInCache($cacheDir, $url)) {
        if ($debug) print "CACHE: get from cache: $url
";
        $result = getFromCache($cacheDir, $url);
    } else {
        // cURL is used rather than file_get_contents($url), which is not
        // advised for remote URLs due to security concerns.
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        // URLs are built from user-submitted lines, so only allow web
        // protocols; otherwise schemes such as file:// could be requested.
        if (defined('CURLOPT_PROTOCOLS')) {
            curl_setopt($ch, CURLOPT_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
        }
        $result = curl_exec($ch);
        curl_close($ch);

        if ($result === false) {
            // Do not cache a failed fetch: it would be stored as an empty
            // entry and served from the cache on every later request.
            if ($debug) print "CACHE: fetch failed, not caching: $url
";
        } else {
            if ($debug) print "CACHE: add to cache: $url
";
            addToCache($cacheDir, $url, $result);
        }
    }

    if ($debug) print "Getting html for $url took ". (microtime_float() - $gh_time) . "s
";
    return $result;
}
/**
 * Returns the current Unix timestamp in seconds, including the fractional
 * (microsecond) part, for measuring elapsed time.
 *
 * @return float Seconds since the Unix epoch.
 */
function microtime_float()
{
    // microtime(true) returns the float directly; there is no need to parse
    // and add up the "usec sec" string form.
    return microtime(true);
}
?>