#!/usr/bin/php
<?php

/*

blockfindkeyterms v 1.2

by The Crisses

No warranties.  I am not responsible for you damaging your computer through the use of this software, nor am I getting any money from you if it picks out the winning lottery numbers.  You're on your own as far as that goes.

This script is open source -- I don't really care what license.  Use it however you need to.  Email crisses@kinhost.org for help with the script, requested features, comments, questions, and crisses-paypal@otherkin.net with paypal donations ;)

Good luck!

Crisses


Usage:
./blockfindkeyterms textfilecontents.txt

Suggested usage as a pipeline:
./blockfindkeyterms textfilecontents.txt | sed 's/^/block:/g' > outputblocklisttext.txt


This script takes a filename as a command-line argument.  It expects a text file whose body contains hyperlinks and links to wikispammers' sites.

The script will send a listing of terms to stdout that can be piped into other programs for further processing.


DO NOT just add this list straight to the blocklist.  Go through the list and make sure the terms are ones you want to block, strip out redundant tags, check in case posts that were legitimate were blocked by accident, etc.  

You want a slimmer, not fatter, blocklist unless your server can really handle it, and eliminating something like "server.com" would also block posts to "linuxserver.com" which might be legitimate.  Choose items with some care.


*/

/********* User Configured Variables *************/
// These arrays are read as globals by noignorecontent(), noignoreregex() and stripregex().
// The "Input" arrays apply while the raw file lines are filtered; the "Output" arrays
// apply to the extracted terms once the script has switched to its output stage.


// optional: skip input lines that contain any of these plain strings (faster, preferred)
$IgnoreInputContent = array();

// optional: skip input lines matching any of these regexes (slower, advanced, strict, complex).
// Use perl-compatible regexes enclosed in '/regex/'
$IgnoreInputRegex = array();

// regexes you would like stripped (replaced with "") from every input line, before processing
$StripInputRegex = array();

// regexes you would like stripped (replaced with "") from every extracted term, after processing.
// This example strips a leading "word.html" when the "word" is 1 to 6 letters long.
// A term that consists of nothing else becomes blank and is dropped from the output,
// which leaves off a lot of smaller words you won't want to block.
$StripOutputRegex = array('/^[A-Za-z]{1,6}\.html/');

// optional: drop output terms that contain any of these plain strings (faster, preferred)
$IgnoreOutputContent = array();

// optional: drop output terms matching any of these regexes (slower, advanced, strict, complex).
// Use perl-compatible regexes enclosed in '/regex/'
$IgnoreOutputRegex = array();


/********* Do Not Alter Below *************/


/* This one is pretty complex, and has some odd theory behind it.

First you take the raw input -- usually a load of links.

The easy things to search on are the hyphenated terms that proliferate, and the domain-names.

Finds any number of dash(-) or dot(.) separated words, and filters out chosen terms, checks for duplicates, etc.  These are passed out through standard output for further processing.
*/

// to streamline, create a flag for current script status -- currently processing input.
// The filter callbacks defined further down read this global to decide whether the
// input or the output rule arrays apply.
$status = "input";

// a filename is required; stop with a usage hint rather than handing nothing to file()
if (!isset($argv[1])) {
	fwrite(STDERR, "Usage: ./blockfindkeyterms textfilecontents.txt\n");
	exit(1);
}

// read in the file specified on the command line, as an array of lines
$originalcontent = file($argv[1]);
if ($originalcontent === false) {
	fwrite(STDERR, "Could not read input file: " . $argv[1] . "\n");
	exit(1);
}

// debug line
//echo "Total original lines is " . count($originalcontent) . "\n";

// pass original content through filters, as specified by user variables above.  May work on streamlining this further later.
$secondarycontent = array_filter($originalcontent,"noignorecontent");
$blockedcontent = array_filter($secondarycontent,"noignoreregex");
array_walk($blockedcontent, "stripregex");

// define the regexes needed to parse dot & dash content -- can be greatly streamlined later
// dash terms: letter-only words joined by one or more dashes; dot terms: letter-only words joined by single dots
$dashregex = '([a-zA-Z]+(-+[a-zA-Z]+)+)';
$dotregex = '([a-zA-Z]+(\.[a-zA-Z]+)+)';

//debug line
//echo "Total lines is " . count($blockedcontent) . "\n";

// remove duplicates
$blockedcontent = array_unique($blockedcontent);

//debug line
//echo "Total lines take 2 is " . count($blockedcontent) . "\n";

// the accumulators must exist as arrays before the first merge: array_merge() needs
// real arrays, and starting from an undefined variable loses every match
$hasdots = array();
$hasdashes = array();

// yank out the content we're looking for
foreach ($blockedcontent as $line) {
	// index 1 holds the outer capture group, i.e. the complete dotted/dashed term
	preg_match_all ("/$dotregex/", $line, $matches);
	$hasdots = array_merge($hasdots, $matches[1]);
	preg_match_all ("/$dashregex/", $line, $matched);
	$hasdashes = array_merge($hasdashes, $matched[1]);
}

//debug line
//echo "Stage1: Dots is " . count($hasdots) . ".  Dash is " . count($hasdashes) . ".\n";

// remove duplicates
$hasdotunique = array_unique($hasdots);
$hasdashunique = array_unique($hasdashes);

// debug line
//echo "Stage2: Dots is " . count($hasdotunique) . ".  Dash is " . count($hasdashunique) . ".\n";

// Entering the stage where output is being created -- change the flag to signal that we're looking for output filters
$status = "output";

// run arrays through filters to weed the output
$filterdot = array_filter($hasdotunique,"noignorecontent");
$filterdotresults = array_filter($filterdot,"noignoreregex");

$filterdash = array_filter($hasdashunique,"noignorecontent");
$filterdashresults = array_filter($filterdash,"noignoreregex");
array_walk ($filterdotresults, "stripregex");
array_walk ($filterdashresults, "stripregex");

// clean up by removing blank entries from the arrays.
// array_unique() and array_filter() keep the original keys, so the arrays can have gaps;
// re-index them before cleaning, and cast the result so an empty outcome is still an array.
$rawdot = (array) array_clean(array_values($filterdotresults));
$rawdash = (array) array_clean(array_values($filterdashresults));

//debug line
//echo "Stage3: Dots is " . count($rawdot) . ".  Dash is " . count($rawdash) . ".\n";

// output the parsed information, dot terms first, one term per line.
// Joining a single merged list avoids emitting a blank line when one of the lists is
// empty -- a blank line would turn into a bare "block:" entry in the suggested sed pipeline.
echo implode("\n", array_merge($rawdot, $rawdash));

// Filter callback: returns false when a line contains any plain string the user asked
// to ignore, and true when the line should be kept.
// Reads the global $status flag: while it is anything other than "output" the
// $IgnoreInputContent list applies, otherwise $IgnoreOutputContent.
function noignorecontent($line){
	global $status, $IgnoreInputContent, $IgnoreOutputContent;

	// choose input or output parse array; the cast lets a single string stand in for a list
	if ($status != "output"){
		$content = (array) $IgnoreInputContent;
	} else {
		$content = (array) $IgnoreOutputContent;
	} // end if status

	// get one item of ignore content and check if the line contains it
	foreach ($content as $test){
		$test = (string) $test;
		// an empty needle can never meaningfully "match"; skip it so it cannot discard every line
		if ($test === '') continue;
		// compare the position against false: a truthiness test on strstr() misses a match
		// whose remainder is "0" (for example the needle "0" at the very end of a line)
		if (strpos((string) $line, $test) !== false) return false;
	} // end foreach

	// if the function gets here, then there was no ignored content in the line; it's a keeper
	return true;
} // end function

// Filter callback: answers true for a line worth keeping and false as soon as the line
// matches one of the user's "ignore" regexes.
// The global $status flag selects the pattern list: $IgnoreOutputRegex when it equals
// "output", $IgnoreInputRegex in every other case.
function noignoreregex($line){
	global $IgnoreInputRegex, $IgnoreOutputRegex, $status;

	// pick the pattern list that belongs to the current stage
	$patterns = ($status == "output") ? $IgnoreOutputRegex : $IgnoreInputRegex;

	// nothing configured -- every line is a keeper
	if (count($patterns) <= 0) {
		return true;
	}

	// first matching pattern rejects the line
	foreach ($patterns as $pattern) {
		if (preg_match($pattern, $line)) {
			return false;
		}
	}

	// no pattern matched
	return true;
} // end function

// Deletes (replaces with an empty string) every match of the user's "strip" regexes.
// $line is taken by reference and rewritten in place, so the function can be used as an
// array_walk() callback.  The global $status flag selects the pattern list:
// $StripOutputRegex when it equals "output", $StripInputRegex otherwise.
function stripregex(&$line){
	global $StripInputRegex, $StripOutputRegex, $status;

	// pattern list for the current stage
	$patterns = ($status == "output") ? $StripOutputRegex : $StripInputRegex;

	// no patterns configured -- leave the line untouched
	if (!count($patterns)) {
		return;
	}

	// apply the patterns one after another, each working on the previous result
	foreach ($patterns as $pattern) {
		$line = preg_replace($pattern, '', $line);
	}
} // end function

// this is never called.  What's it doing here?  Leaving it because I can reuse it.
// removed code earlier that would compare output with a blocklist file
//
// Takes an array of lines and returns a flat array of the terms that follow each
// "block:" marker.  Lines without the marker are skipped, terms are trimmed, and
// pieces that contain no letters are dropped.  Always returns an array (possibly empty).
function stripblock($inarray){
	// start with an empty list so the result is an array even when nothing matches
	$outarray = array();
	foreach ($inarray as $myline) {
		// the marker is looked for case-insensitively ...
		if (stripos($myline, "block:") === false) continue;
		// ... so split case-insensitively as well, otherwise "Block:" lines would be
		// detected but never split into terms
		$workarray = preg_split('/block:/i', $myline);
		foreach ($workarray as $blockline){
			$blockline = trim($blockline);
			// skip empty pieces and anything without at least one letter
			if (!preg_match("/[A-Za-z]+/",$blockline)) continue;
			$outarray[] = $blockline;
		}
	}
	return $outarray;
}

// this function will output a cleaned-up, freshly indexed (0..n-1) array.
// $input         -- the array to clean; any keys are accepted, gaps included
// $delete        -- optional text: when given, entries containing it are removed;
//                   when left false, entries that PHP considers empty() are removed
//                   (note that this includes the string "0")
// $caseSensitive -- whether the $delete text is matched case-sensitively
// Always returns an array, possibly empty.
function array_clean ($input, $delete = false, $caseSensitive = false) {
	// start from an empty list so callers always get an array back
	$return = array();
	if (!is_array($input)) {
		return $return;
	}
	// foreach visits every element whatever its key; counting from 0 would skip
	// entries in arrays that array_filter()/array_unique() left with gaps
	foreach ($input as $value) {
		if ($delete) {
			// compare the position against false so a match is never mistaken for
			// "not found" (a truthiness test fails when the matched remainder is "0")
			if ($caseSensitive) {
				$position = strpos($value, $delete);
			} else {
				$position = stripos($value, $delete);
			}
			if ($position === false) {
				$return[] = $value;
			}
		} elseif (!empty($value)) {
			$return[] = $value;
		}
	}
	return $return;
}

/* Change log

1.2 -- commented.
1.1 -- forgot to comment out debugging lines.  Fixed & uploaded.  Working on commenting the text better.

*/

?>