MongoDB, PHP

Basic Mongo Realtime Search

Please note: This script is designed for testing purposes only. In reality, MongoDB is not really designed to do this, and you would be much better off with a real search technology such as Lucene, Solr, Elasticsearch, Sphinx, Xapian, or one of the many others out there.

The main file that controls the workings of the search:

<?php
GImport::loadModule('glue.plugins.stemmer.stem');

/**
 * Base model for a basic "realtime" search built on a MongoDB map/reduce job.
 *
 * A concrete search model extends this class, sets the source collection and
 * the fields to match against, and overrides map() with the JavaScript that
 * scores each candidate document. search() runs the job into a temporary
 * collection named after the current session id and returns the result of
 * find() on it; that collection is dropped when the model is destroyed.
 *
 * NOTE(review): DbCommand(), Db() and strip_whitespace() are not defined in
 * this file; the first two are presumably inherited from GModel - confirm.
 */
class GModelSearch extends GModel{

	protected $term; // The search term; filled in by search() unless already set
	protected $globals; // Not used by this class
	protected $constraint = array(); // The constraint from the inherited search model

	protected $collection; // Define the source collection
	protected $output; // Where will the results go? (not used by this class)
	protected $fields = array(); // What fields will be queried?

	/**
	 * Gives read access to the protected properties from outside the class.
	 *
	 * @param string $k property name
	 * @return mixed
	 */
	public function __get($k){
		return $this->$k;
	}

	/**
	 * Runs the map/reduce search and returns the result of find() on the output collection.
	 *
	 * When no term has been set on the model, the "q" request parameter is used if
	 * present, otherwise the $term argument. With a term the job is restricted by
	 * formQuery() and scored with the variants from formGlobals(); without one the
	 * job runs over everything matching the model's constraint.
	 *
	 * @param string|null $term search term, only used when no "q" parameter is present
	 * @return mixed whatever Db(...)->find() returns
	 */
	public function search($term = null){

		if(!$this->term){
			// A non-string "q" (e.g. ?q[]=x) is ignored instead of being handed to strip_whitespace()
			$raw = (isset($_GET['q']) && is_string($_GET['q'])) ? $_GET['q'] : $term;
			$this->term = strip_whitespace($raw);
		}

		$command = array(
			"mapreduce"=>$this->collection,
			"map"=>$this->map(),
			"reduce"=>$this->reduce(),
		);

		if(!empty($this->term)){
			$command["scope"] = array('terms'=>$this->formGlobals($this->term));
			$command["query"] = $this->formQuery($this->term);
			$command["out"] = $this->outputCollection();
			$command["verbose"] = true;
		}else{
			// No term: expose terms as null so the map function can tell there is nothing to score
			$command["scope"] = array('terms'=>null);
			$command["query"] = $this->constraint;
			$command["out"] = $this->outputCollection();
		}

		$rs = $this->DbCommand($command);

		return $this->Db($rs['result'])->find();
	}

	/**
	 * Builds the "query" part of the map/reduce command.
	 *
	 * Every searched field is matched case-insensitively against every variant of
	 * the term (see termVariants()); the alternatives are combined in one $or. If
	 * the model constraint carries its own $or, each of its clauses is merged into
	 * every alternative, because a query document can only hold one top-level $or.
	 *
	 * The variants are escaped with preg_quote(), so search text is always matched
	 * literally. The model's constraint is read but never modified.
	 *
	 * @param string $term
	 * @return array query document
	 */
	public function formQuery($term){
		// Work on a copy so that repeated calls keep seeing the original constraint
		$constraint = $this->constraint;

		$or_c = array();
		if(isset($constraint["\$or"])){
			$or_c = (array)$constraint["\$or"];
			unset($constraint["\$or"]);
		}
		if(count($or_c) === 0){
			// Nothing to combine with: one empty clause lets the loop below treat both cases alike
			$or_c = array(array());
		}

		$patterns = array();
		foreach($this->termVariants($term) as $variant){
			$patterns[] = new MongoRegex("/".preg_quote($variant, "/")."/i");
		}

		$keywords = array();
		foreach($this->fields as $field){
			foreach($or_c as $clause){
				foreach($patterns as $pattern){
					$keywords[] = array_merge(array($field=>$pattern), $clause);
				}
			}
		}

		if(count($keywords) === 0){
			// An empty $or is not a valid query; fall back to the plain constraint
			return $this->constraint;
		}

		return array_merge(array("\$or"=>$keywords), $constraint); // Lets put the or statement together
	}

	/**
	 * Builds the set of term variants pushed into the map/reduce scope as "terms",
	 * so the map function can judge relevancy.
	 *
	 * @param string $term
	 * @return array map of lower-cased variant => 1
	 */
	public function formGlobals($term){
		// The map function compares against JavaScript toLowerCase(), which is Unicode
		// aware; plain strtolower() is byte-wise, so prefer the multibyte version
		$term = function_exists('mb_strtolower') ? mb_strtolower($term, 'UTF-8') : strtolower($term);

		$globals = array();
		foreach($this->termVariants($term) as $variant){
			$globals[$variant] = 1;
		}

		return $globals;
	}

	/**
	 * Lists the distinct, non-empty strings a term should be matched as: the whole
	 * term, its ASCII transliteration and, for each space separated word, the word
	 * itself, its transliteration and the Porter stems of both.
	 *
	 * @param string $term
	 * @return array list of strings
	 */
	protected function termVariants($term){
		$term = (string)$term;
		$candidates = array($term, $this->toAscii($term));

		foreach(explode(" ", $term) as $word){
			if($word === '') continue; // consecutive spaces

			$ascii = $this->toAscii($word);
			$candidates[] = PorterStemmer::Stem($ascii);
			$candidates[] = PorterStemmer::Stem($word);
			$candidates[] = $word;
			$candidates[] = $ascii;
		}

		// An empty variant would turn into a pattern that matches every document
		$variants = array();
		foreach($candidates as $candidate){
			$candidate = (string)$candidate;
			if($candidate !== '' && !in_array($candidate, $variants, true)){
				$variants[] = $candidate;
			}
		}

		return $variants;
	}

	/**
	 * Transliterates UTF-8 text to ASCII. iconv() returns false when it cannot
	 * convert the input; the text is then returned unchanged.
	 *
	 * @param string $text
	 * @return string
	 */
	protected function toAscii($text){
		$ascii = iconv('UTF-8', 'ASCII//TRANSLIT', $text);
		return $ascii === false ? $text : $ascii;
	}

	/**
	 * Name of the collection the map/reduce job writes to: the session id followed by "_rsearch".
	 *
	 * @return string
	 */
	protected function outputCollection(){
		return session_id()."_rsearch";
	}

	/**
	 * The map function of the job; returns nothing here, concrete search models override it.
	 */
	public function map(){}

	/**
	 * The reduce function of the job: hands back the last value emitted for a key.
	 *
	 * @return MongoCode
	 */
	public function reduce(){
		return new MongoCode("function(key, values) {
			var docs = '';
			values.forEach ( function(val) { docs = val; })
			return docs;
		}");
	}

	/**
	 * Drops the temporary result collection when the model goes away.
	 */
	public function __destruct(){
		$this->drop();
	}

	/**
	 * Drop the temp Collection after
	 */
	public function drop(){
		$this->Db($this->outputCollection())->drop();
	}
}

The search class itself then inherits from this class:

<?php
GImport::push('GModelSearch', 'glue.GModelSearch');

/**
 * Search model for the "videos" collection.
 *
 * Names the collection and the fields to query, and supplies the JavaScript
 * map function that gives every candidate video a relevancy score.
 */
class videoSearch extends GModelSearch{

	protected $collection = "videos"; // This is the collection in question
	protected $fields =  array("title", "t_normalised", "tags"); // These are the fields I wish to query on in the collection

	/**
	 * The map function: emits one value per video, keyed by _id, carrying the
	 * fields needed for display plus a "score".
	 *
	 * The function reads a variable named "terms" from the job scope. When it is
	 * null every video is emitted with the base score of 1. Otherwise its keys are
	 * treated as lower-cased search strings and the score is raised by 100 for an
	 * exact title match, else by 15 per string found in the title; by 7 per tag
	 * that equals a string, and by 2 per string found inside any other tag.
	 *
	 * Search strings are compared as plain text (case-insensitive substring
	 * search), never compiled into a RegExp, so punctuation in a query cannot
	 * abort the job. Videos without a title, normalised title or tags are scored
	 * as if those fields were empty.
	 *
	 * @return MongoCode
	 */
	function map(){
		return new MongoCode("function() {

			/**
			* Search Videos
			*
			* The rules that apply are:
			*  - Can only have either an exact or rough title match
			*  - Can only have either a rough or exact match of each tag
			*/
			var score = 1;

			if(typeof terms != 'undefined' && terms != null){

				var title_match_x = false, title_match_r = 0, tag_match_x = 0, tag_match_r = 0,
					i, j, needle, tag;

				// Only own keys count as terms, so a title such as 'constructor' is no accidental hit
				var has = function(k){
					return k !== '' && Object.prototype.hasOwnProperty.call(terms, k);
				};

				// A missing field is treated as empty instead of throwing
				var title = (this.title == null ? '' : String(this.title)).toLowerCase(),
					t_normalised = (this.t_normalised == null ? '' : String(this.t_normalised)).toLowerCase();

				//Detect a title match
				if(has(title) || has(t_normalised)){
					//exact
					title_match_x = true;
				}else{
					//rough
					for(i in terms){
						needle = String(i).toLowerCase();
						if(needle.length == 0) continue;

						if(title.indexOf(needle) != -1 || t_normalised.indexOf(needle) != -1){
							title_match_r += 1;
						}
					}
				}

				//Detect tag matches
				if(this.tags != null){
					for(j=0; j<this.tags.length; j++){
						if(this.tags[j] == null) continue;
						tag = String(this.tags[j]).toLowerCase();

						if(has(tag)){
							tag_match_x += 1;
						}else{
							for(i in terms){
								needle = String(i).toLowerCase();
								if(needle.length == 0) continue;

								if(tag.indexOf(needle) != -1){
									tag_match_r += 1;
								}
							}
						}
					}
				}

				// calculate score
				if(title_match_x){
					score += 100;
				}else{
					score += title_match_r*15;
				}

				// Each tag was counted as either exact or rough above, so both counters
				// contribute; a video with both kinds of tag match keeps its tag score
				score += tag_match_x*7 + tag_match_r*2;
			}

			// With no term supplied the document comes straight back out with the base score
			emit(this._id, {title:this.title, tags:this.tags, duration:this.duration, size:this.f_size, image:this.image_id, ts:this.ts, score:score, user_id:this.user_id, desc:this.desc});
		}");
	}
}
Advertisements

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s