<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"
	xmlns:content="http://purl.org/rss/1.0/modules/content/"
	xmlns:wfw="http://wellformedweb.org/CommentAPI/"
	xmlns:dc="http://purl.org/dc/elements/1.1/"
	xmlns:atom="http://www.w3.org/2005/Atom"
	xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
	xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
	xmlns:georss="http://www.georss.org/georss" xmlns:geo="http://www.w3.org/2003/01/geo/wgs84_pos#" xmlns:media="http://search.yahoo.com/mrss/"
	>

<channel>
	<title>Job tech blog</title>
	<atom:link href="http://findmeajob.wordpress.com/feed/" rel="self" type="application/rss+xml" />
	<link>http://findmeajob.wordpress.com</link>
	<description>Just another WordPress.com weblog</description>
	<lastBuildDate>Fri, 07 Mar 2008 06:44:10 +0000</lastBuildDate>
	<language>en</language>
	<sy:updatePeriod>hourly</sy:updatePeriod>
	<sy:updateFrequency>1</sy:updateFrequency>
	<generator>http://wordpress.com/</generator>
<cloud domain='findmeajob.wordpress.com' port='80' path='/?rsscloud=notify' registerProcedure='' protocol='http-post' />
<image>
		<url>http://s2.wp.com/i/buttonw-com.png</url>
		<title>Job tech blog</title>
		<link>http://findmeajob.wordpress.com</link>
	</image>
	<atom:link rel="search" type="application/opensearchdescription+xml" href="http://findmeajob.wordpress.com/osd.xml" title="Job tech blog" />
	<atom:link rel='hub' href='http://findmeajob.wordpress.com/?pushpress=hub'/>
		<item>
		<title>Bonjour to distribute search engine traffic</title>
		<link>http://findmeajob.wordpress.com/2008/03/05/bonjour-to-distribute-search-engine-traffic/</link>
		<comments>http://findmeajob.wordpress.com/2008/03/05/bonjour-to-distribute-search-engine-traffic/#comments</comments>
		<pubDate>Wed, 05 Mar 2008 18:48:50 +0000</pubDate>
		<dc:creator>thedjinn</dc:creator>
				<category><![CDATA[backend]]></category>
		<category><![CDATA[index]]></category>
		<category><![CDATA[search]]></category>
		<category><![CDATA[vertical search]]></category>
		<category><![CDATA[announce]]></category>
		<category><![CDATA[apple]]></category>
		<category><![CDATA[avahi]]></category>
		<category><![CDATA[bonjour]]></category>
		<category><![CDATA[discovery]]></category>
		<category><![CDATA[linux]]></category>
		<category><![CDATA[p2p]]></category>
		<category><![CDATA[platform]]></category>
		<category><![CDATA[search engine]]></category>
		<category><![CDATA[search platform]]></category>

		<guid isPermaLink="false">http://findmeajob.wordpress.com/?p=20</guid>
		<description><![CDATA[There are hundred different ways of distributing search traffic over a farm of search engines. From hard-coded configuration to multi-casted message-bus. If not hard to implement, they are hard to understand for someone starting out with search-engines. But not so with bonjour, which is P2P service announcement of sorts. Assuming bonjour server/client you are using [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=findmeajob.wordpress.com&amp;blog=1344134&amp;post=20&amp;subd=findmeajob&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>There are hundred different ways of distributing search traffic over a farm of search engines. From hard-coded configuration to multi-casted message-bus. If not hard to implement, they are hard to understand for someone starting out with search-engines. But not so with bonjour, which is P2P service announcement of sorts. Assuming bonjour server/client you are using is avahi, just run avahi-browse to find what services are running in current network. Of course, not great if the farm spans more than one network. But it&#8217;s so easy. I am surprised, it&#8217;s so easy to announce and discover. </p>
<br /><img alt="" border="0" src="http://feeds.wordpress.com/1.0/categories/findmeajob.wordpress.com/20/" /> <img alt="" border="0" src="http://feeds.wordpress.com/1.0/tags/findmeajob.wordpress.com/20/" /> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/findmeajob.wordpress.com/20/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/findmeajob.wordpress.com/20/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/findmeajob.wordpress.com/20/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/findmeajob.wordpress.com/20/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/findmeajob.wordpress.com/20/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/findmeajob.wordpress.com/20/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/findmeajob.wordpress.com/20/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/findmeajob.wordpress.com/20/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/findmeajob.wordpress.com/20/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/findmeajob.wordpress.com/20/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/findmeajob.wordpress.com/20/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/findmeajob.wordpress.com/20/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/findmeajob.wordpress.com/20/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/findmeajob.wordpress.com/20/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=findmeajob.wordpress.com&amp;blog=1344134&amp;post=20&amp;subd=findmeajob&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://findmeajob.wordpress.com/2008/03/05/bonjour-to-distribute-search-engine-traffic/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://1.gravatar.com/avatar/9bae1d7a91cd9e390a4d8d92c2636aa7?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">thedjinn</media:title>
		</media:content>
	</item>
		<item>
		<title>Presenting at OSIW</title>
		<link>http://findmeajob.wordpress.com/2008/02/18/presenting-at-osiw/</link>
		<comments>http://findmeajob.wordpress.com/2008/02/18/presenting-at-osiw/#comments</comments>
		<pubDate>Mon, 18 Feb 2008 12:44:18 +0000</pubDate>
		<dc:creator>thedjinn</dc:creator>
				<category><![CDATA[Free Text Search]]></category>
		<category><![CDATA[keyword search]]></category>
		<category><![CDATA[search]]></category>
		<category><![CDATA[User interface]]></category>
		<category><![CDATA[vertical search]]></category>
		<category><![CDATA[OSIW]]></category>
		<category><![CDATA[presentation]]></category>
		<category><![CDATA[talk]]></category>
		<category><![CDATA[xapian]]></category>

		<guid isPermaLink="false">http://findmeajob.wordpress.com/?p=18</guid>
		<description><![CDATA[Its easy to think up something and keep stroking the thought to conclusion which looks aesthetically proper to you. Its entirely different game to actually convince someone else of your argument. But public speaking is like a barometer what you think and believe. These thoughts relate to the fact that I gave a talk at [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=findmeajob.wordpress.com&amp;blog=1344134&amp;post=18&amp;subd=findmeajob&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>Its easy to think up something and keep stroking the thought to conclusion which looks aesthetically proper to you. Its entirely different game to actually convince someone else of your argument. But public speaking is like a barometer what you think and believe. These thoughts relate to the fact that I gave a talk at OSIW about search engines. </p>
<p>The talk was controversially called &#8216;Who is scared of google?&#8217;, with a sincere belief that search engines are going to evolve and stop where they are right now. Also that one could come up with specialized search engines using FOSS tools. The presentation is available <a href='http://findmeajob.files.wordpress.com/2008/02/search_osiw.pdf' title='Who is scared of Google?'>here</a></p>
<br /><img alt="" border="0" src="http://feeds.wordpress.com/1.0/categories/findmeajob.wordpress.com/18/" /> <img alt="" border="0" src="http://feeds.wordpress.com/1.0/tags/findmeajob.wordpress.com/18/" /> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/findmeajob.wordpress.com/18/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/findmeajob.wordpress.com/18/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/findmeajob.wordpress.com/18/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/findmeajob.wordpress.com/18/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/findmeajob.wordpress.com/18/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/findmeajob.wordpress.com/18/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/findmeajob.wordpress.com/18/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/findmeajob.wordpress.com/18/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/findmeajob.wordpress.com/18/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/findmeajob.wordpress.com/18/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/findmeajob.wordpress.com/18/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/findmeajob.wordpress.com/18/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/findmeajob.wordpress.com/18/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/findmeajob.wordpress.com/18/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=findmeajob.wordpress.com&amp;blog=1344134&amp;post=18&amp;subd=findmeajob&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://findmeajob.wordpress.com/2008/02/18/presenting-at-osiw/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://1.gravatar.com/avatar/9bae1d7a91cd9e390a4d8d92c2636aa7?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">thedjinn</media:title>
		</media:content>
	</item>
		<item>
		<title>Fancy is out, darn boring is in</title>
		<link>http://findmeajob.wordpress.com/2007/10/05/mental-models-for-search-are-getting-firmer/</link>
		<comments>http://findmeajob.wordpress.com/2007/10/05/mental-models-for-search-are-getting-firmer/#comments</comments>
		<pubDate>Fri, 05 Oct 2007 03:06:48 +0000</pubDate>
		<dc:creator>thedjinn</dc:creator>
				<category><![CDATA[Free Text Search]]></category>
		<category><![CDATA[keyword search]]></category>
		<category><![CDATA[search]]></category>
		<category><![CDATA[User interface]]></category>
		<category><![CDATA[vertical search]]></category>
		<category><![CDATA[web2.0]]></category>
		<category><![CDATA[front end]]></category>
		<category><![CDATA[search engine]]></category>
		<category><![CDATA[web layout]]></category>

		<guid isPermaLink="false">http://findmeajob.wordpress.com/2007/10/05/mental-models-for-search-are-getting-firmer/</guid>
		<description><![CDATA[Verdict is in. People want &#8216;normal&#8217; looking search engine. A search engine invokes a mental map which is getting re-enforced in our mind. Even separating out certain search results in a box entails a risk of users overlooking those links assuming it to be ads. In his article Jacob Neilson warns against trying to change [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=findmeajob.wordpress.com&amp;blog=1344134&amp;post=16&amp;subd=findmeajob&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>Verdict is in. People want &#8216;normal&#8217; looking search engine. A search engine invokes a mental map which is getting re-enforced in our mind. Even separating out certain search results in a box entails a risk of users overlooking those links assuming it to be ads.</p>
<p>In his article Jacob Neilson warns against trying to change the search user interface. This argues that search engines should not try to distinguish themselves with fancy front ends.<br />
Article available <a href="http://www.useit.com/alertbox/20050509.html">here</a> </p>
<br /><img alt="" border="0" src="http://feeds.wordpress.com/1.0/categories/findmeajob.wordpress.com/16/" /> <img alt="" border="0" src="http://feeds.wordpress.com/1.0/tags/findmeajob.wordpress.com/16/" /> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/findmeajob.wordpress.com/16/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/findmeajob.wordpress.com/16/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/findmeajob.wordpress.com/16/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/findmeajob.wordpress.com/16/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/findmeajob.wordpress.com/16/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/findmeajob.wordpress.com/16/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/findmeajob.wordpress.com/16/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/findmeajob.wordpress.com/16/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/findmeajob.wordpress.com/16/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/findmeajob.wordpress.com/16/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/findmeajob.wordpress.com/16/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/findmeajob.wordpress.com/16/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/findmeajob.wordpress.com/16/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/findmeajob.wordpress.com/16/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=findmeajob.wordpress.com&amp;blog=1344134&amp;post=16&amp;subd=findmeajob&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://findmeajob.wordpress.com/2007/10/05/mental-models-for-search-are-getting-firmer/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://1.gravatar.com/avatar/9bae1d7a91cd9e390a4d8d92c2636aa7?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">thedjinn</media:title>
		</media:content>
	</item>
		<item>
		<title>Grapeshot and small footprint</title>
		<link>http://findmeajob.wordpress.com/2007/08/29/grapeshot-and-small-footprint/</link>
		<comments>http://findmeajob.wordpress.com/2007/08/29/grapeshot-and-small-footprint/#comments</comments>
		<pubDate>Wed, 29 Aug 2007 04:35:06 +0000</pubDate>
		<dc:creator>thedjinn</dc:creator>
				<category><![CDATA[apache]]></category>
		<category><![CDATA[grapeshot]]></category>
		<category><![CDATA[lucene]]></category>
		<category><![CDATA[search]]></category>
		<category><![CDATA[vertical search]]></category>

		<guid isPermaLink="false">http://findmeajob.wordpress.com/2007/08/29/grapeshot-and-small-footprint/</guid>
		<description><![CDATA[Grapeshot blitz Grapeshot is a SDK providing advanced concept-based bayesian search methods for developers to insert &#8220;implicit search&#8221; capabilities inside application. In plain english, a promising search engine library for developers. The technology section summarizes various aspects of the library which puts it apart from other similar projects. Some interesting features are: Document clustering Sentences [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=findmeajob.wordpress.com&amp;blog=1344134&amp;post=13&amp;subd=findmeajob&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p><strong>Grapeshot blitz</strong><br />
<a href="http://www.grapeshot.co.uk">Grapeshot</a> is a SDK providing advanced concept-based bayesian search methods for developers to insert &#8220;implicit search&#8221; capabilities inside application. In plain english, a promising search engine library for developers. </p>
<p>The technology section summarizes various aspects of the library which puts it apart from other similar projects. Some interesting features are:</p>
<ul>
<li>Document clustering</li>
<li>Sentences or paragraphs can be used as queries</li>
<li>Word ranking</li>
</ul>
<p>One <a href="http://www.grapeshot.co.uk/shared/small-footprint.php">feature</a> that has been highlighted is its small footprint. Grapeshot claims to be 300K binary.<br />
<img src='http://findmeajob.files.wordpress.com/2007/08/software-footprint.gif?w=510' alt='small footprint' /><br />
The bar graph shows, what grapeshot claims to be sizes of binaries for various similar software libraries. The footprint of lucene specifically is of interest. Unlike claimed by the site 11+MB, <a href="http://lucene.apache.org/">lucene</a> core jar file as of 2.2.0 version is about 526K only. Which could also be reduced depending on the users requirement.</p>
<p><strong>Reducing binary footprint of lucene</strong><br />
Although 526K doesn&#8217;t seem like a large footprint. As an exercise, one can reduce it for embedded or mobile device like grapeshot claims. To reduce binary size:</p>
<ul>
<li> Run the java application of interest with <strong>-verbose:class</strong> flag. This produces verbose output of class loading details on stdout</li>
<li> Run the output through <strong><br />
<code>cat * |grep lucene-core|cut -f2 -d' '|uniq|tr '.' '/'| awk '{printf "%s.class\n", $1}'</code><br />
</strong> command. This will filter out all the classes from lucene library loaded at runtime</li>
<li>Create a custom jar file by deleting all .class files which are not in the list.</li>
</ul>
<p>Following this procedure for demo application bundled with lucene core binary, custom jar was reduced by half to 262k. Less than Grapeshot binary.</p>
<p>As a side note, <a href="http://rafb.net/p/qsLWfm76.html">this</a> python script can be used to delete files from the extracted jar.</p>
<br /><img alt="" border="0" src="http://feeds.wordpress.com/1.0/categories/findmeajob.wordpress.com/13/" /> <img alt="" border="0" src="http://feeds.wordpress.com/1.0/tags/findmeajob.wordpress.com/13/" /> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/findmeajob.wordpress.com/13/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/findmeajob.wordpress.com/13/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/findmeajob.wordpress.com/13/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/findmeajob.wordpress.com/13/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/findmeajob.wordpress.com/13/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/findmeajob.wordpress.com/13/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/findmeajob.wordpress.com/13/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/findmeajob.wordpress.com/13/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/findmeajob.wordpress.com/13/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/findmeajob.wordpress.com/13/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/findmeajob.wordpress.com/13/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/findmeajob.wordpress.com/13/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/findmeajob.wordpress.com/13/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/findmeajob.wordpress.com/13/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=findmeajob.wordpress.com&amp;blog=1344134&amp;post=13&amp;subd=findmeajob&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://findmeajob.wordpress.com/2007/08/29/grapeshot-and-small-footprint/feed/</wfw:commentRss>
		<slash:comments>2</slash:comments>
	
		<media:content url="http://1.gravatar.com/avatar/9bae1d7a91cd9e390a4d8d92c2636aa7?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">thedjinn</media:title>
		</media:content>

		<media:content url="http://findmeajob.files.wordpress.com/2007/08/software-footprint.gif" medium="image">
			<media:title type="html">small footprint</media:title>
		</media:content>
	</item>
		<item>
		<title>Jython 2.2 released!</title>
		<link>http://findmeajob.wordpress.com/2007/08/24/jython-22-released/</link>
		<comments>http://findmeajob.wordpress.com/2007/08/24/jython-22-released/#comments</comments>
		<pubDate>Fri, 24 Aug 2007 13:40:40 +0000</pubDate>
		<dc:creator>thedjinn</dc:creator>
				<category><![CDATA[index]]></category>
		<category><![CDATA[jython]]></category>
		<category><![CDATA[lucene]]></category>

		<guid isPermaLink="false">http://findmeajob.wordpress.com/2007/08/24/jython-22-released/</guid>
		<description><![CDATA[Jython 2.2 released!! Woohoo!!. Jython is a great tool for introspection of lucene indices with full-fledged programming language backing.<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=findmeajob.wordpress.com&amp;blog=1344134&amp;post=11&amp;subd=findmeajob&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>Jython 2.2 released!! Woohoo!!.</p>
<p>Jython is a great tool for introspection of lucene indices with full-fledged programming language backing.  </p>
<br /><img alt="" border="0" src="http://feeds.wordpress.com/1.0/categories/findmeajob.wordpress.com/11/" /> <img alt="" border="0" src="http://feeds.wordpress.com/1.0/tags/findmeajob.wordpress.com/11/" /> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/findmeajob.wordpress.com/11/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/findmeajob.wordpress.com/11/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/findmeajob.wordpress.com/11/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/findmeajob.wordpress.com/11/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/findmeajob.wordpress.com/11/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/findmeajob.wordpress.com/11/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/findmeajob.wordpress.com/11/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/findmeajob.wordpress.com/11/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/findmeajob.wordpress.com/11/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/findmeajob.wordpress.com/11/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/findmeajob.wordpress.com/11/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/findmeajob.wordpress.com/11/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/findmeajob.wordpress.com/11/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/findmeajob.wordpress.com/11/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=findmeajob.wordpress.com&amp;blog=1344134&amp;post=11&amp;subd=findmeajob&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://findmeajob.wordpress.com/2007/08/24/jython-22-released/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://1.gravatar.com/avatar/9bae1d7a91cd9e390a4d8d92c2636aa7?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">thedjinn</media:title>
		</media:content>
	</item>
		<item>
		<title>Improving indexing performance</title>
		<link>http://findmeajob.wordpress.com/2007/08/24/improving-indexing-performance/</link>
		<comments>http://findmeajob.wordpress.com/2007/08/24/improving-indexing-performance/#comments</comments>
		<pubDate>Fri, 24 Aug 2007 11:17:00 +0000</pubDate>
		<dc:creator>thedjinn</dc:creator>
				<category><![CDATA[apache]]></category>
		<category><![CDATA[fields]]></category>
		<category><![CDATA[Free Text Search]]></category>
		<category><![CDATA[index]]></category>
		<category><![CDATA[keyword search]]></category>
		<category><![CDATA[lucene]]></category>
		<category><![CDATA[search]]></category>
		<category><![CDATA[vertical search]]></category>

		<guid isPermaLink="false">http://findmeajob.wordpress.com/2007/08/24/improving-indexing-performance/</guid>
		<description><![CDATA[Reading through lucene wiki, I came across a nice list of things to try for improving indexing performance. I am listing some of the most striking ones from the page Flush by RAM usage instead of document count. Call writer.ramSizeInBytes() after every added doc then call flush() when it&#8217;s using too much RAM. This is [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=findmeajob.wordpress.com&amp;blog=1344134&amp;post=10&amp;subd=findmeajob&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>Reading through lucene wiki, I came across a nice list of things to try for improving indexing performance. I am listing some of the most striking ones from the page</p>
<ul>
<li><strong>Flush by RAM usage instead of document count.</strong><br />
Call <em>writer.ramSizeInBytes()</em> after every added doc then call <em>flush()</em> when it&#8217;s using too much RAM. This is especially good if you have small docs or highly variable doc sizes. You need to first set <em>maxBufferedDocs</em> large enough to prevent the writer from flushing based on document count. However, don&#8217;t set it too large otherwise you may hit. Somewhere around 2-3X your &#8220;typical&#8221; flush count should be OK.</li>
<li><strong>Turn off compound file format.</strong><br />
Call  <em>setUseCompoundFile(false)</em>. Building the compound file format takes time during indexing (7-33% in testing). However, note that doing this will greatly increase the number of file descriptors used by indexing and by searching, so you could run out of file descriptors if <em>mergeFactor</em> is also large.</li>
<li><strong>Re-use Document and Field instances</strong><br />
<p>As of Lucene 2.3 (not yet released) there are new setValue(&#8230;) methods that allow you to change the value of a Field. This allows you to re-use a single Field instance across many added documents, which can save substantial GC cost.</p>
<p>It&#8217;s best to create a single Document instance, then add multiple Field instances to it, but hold onto these Field instances and re-use them by changing their values for each added document. For example you might have an idField, bodyField, nameField, storedField1, etc. After the document is added, you then directly change the Field values (idField.setValue(&#8230;), etc), and then re-add your Document instance.</p>
<p>Note that you cannot re-use a single Field instance within a Document, and, you should not change a Field&#8217;s value until the Document containing that Field has been added to the index. See <em>Field</em> for details.</p></li>
<li><strong>Re-use a single Token instance in your analyzer</strong><br />
Analyzers often create a new Token for each term in sequence that needs to be indexed from a Field. You can save substantial GC cost by re-using a single Token instance instead.</li>
<li><strong>Use the char[] API in Token instead of the String API to represent token Text</strong><br />
As of Lucene 2.3 (not yet released), a Token can represent its text as a slice into a char array, which saves the GC cost of new&#8217;ing and then reclaiming String instances. By re-using a single Token instance and using the char[] API you can avoid new&#8217;ing any objects for each term. See <em>Token</em> for details.</li>
</ul>
<p>Shamelessly plugged from <a href="http://wiki.apache.org/lucene-java/ImproveIndexingSpeed">here</a> </p>
<br /><img alt="" border="0" src="http://feeds.wordpress.com/1.0/categories/findmeajob.wordpress.com/10/" /> <img alt="" border="0" src="http://feeds.wordpress.com/1.0/tags/findmeajob.wordpress.com/10/" /> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/findmeajob.wordpress.com/10/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/findmeajob.wordpress.com/10/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/findmeajob.wordpress.com/10/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/findmeajob.wordpress.com/10/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/findmeajob.wordpress.com/10/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/findmeajob.wordpress.com/10/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/findmeajob.wordpress.com/10/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/findmeajob.wordpress.com/10/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/findmeajob.wordpress.com/10/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/findmeajob.wordpress.com/10/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/findmeajob.wordpress.com/10/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/findmeajob.wordpress.com/10/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/findmeajob.wordpress.com/10/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/findmeajob.wordpress.com/10/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=findmeajob.wordpress.com&amp;blog=1344134&amp;post=10&amp;subd=findmeajob&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://findmeajob.wordpress.com/2007/08/24/improving-indexing-performance/feed/</wfw:commentRss>
		<slash:comments>4</slash:comments>
	
		<media:content url="http://1.gravatar.com/avatar/9bae1d7a91cd9e390a4d8d92c2636aa7?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">thedjinn</media:title>
		</media:content>
	</item>
		<item>
		<title>looking for a job as a fashion designer for an import/export company</title>
		<link>http://findmeajob.wordpress.com/2007/08/08/looking-for-a-job-as-a-fashion-designer-for-an-importexport-company/</link>
		<comments>http://findmeajob.wordpress.com/2007/08/08/looking-for-a-job-as-a-fashion-designer-for-an-importexport-company/#comments</comments>
		<pubDate>Wed, 08 Aug 2007 10:36:33 +0000</pubDate>
		<dc:creator>thedjinn</dc:creator>
				<category><![CDATA[Free Text Search]]></category>
		<category><![CDATA[keyword search]]></category>
		<category><![CDATA[SEO]]></category>

		<guid isPermaLink="false">http://findmeajob.wordpress.com/2007/08/08/looking-for-a-job-as-a-fashion-designer-for-an-importexport-company/</guid>
		<description><![CDATA[A simple keyword search &#8220;looking for a job as a fashion designer for an import/export company&#8221; on big three job search engines in India gives interesting results: Naukri which claims to be number one jobs site provides no results for this query. Timesjobs which takes ions to provide the results, which are way off from [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=findmeajob.wordpress.com&amp;blog=1344134&amp;post=9&amp;subd=findmeajob&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p> A simple keyword search &#8220;looking for a job as a fashion designer for an import/export company&#8221; on big three job search engines in India gives interesting results:</p>
<ul>
<li>
       <strong>Naukri</strong> which claims to be number one jobs site provides no results for this query.
        </li>
<li>
       <strong>Timesjobs</strong> which takes ions to provide the results, which are way off from the theme of the query. </li>
<li>
       <strong>Monster India</strong> barely provides decent results for the query.</li>
</ul>
<p>Going into the reasons why this query results in abject failure from such premier jobs sites requires a bit of dissection of the query.</p>
<ul>
<li> We have a well formed sentence with lots of what are called <a href="http://en.wikipedia.org/wiki/Stopwords">Stopwords</a>. After the query parsing phase, the query should ideally be left with <strong>job, fashion designer, import/export and company</strong>. These are the only keywords relevant to the query. This is where TimesJobs fails. </li>
<li>Most search engines give every field equal priority. Monsterindia sets itself apart by giving higher priority to the title of the job.</li>
<li>Detecting domain and job type would be a great way of enhancing keyword search. None of the engines do that till now.</li>
<li>import/export has a special character &#8216;/&#8217; which is not handled well by search engines.</li>
</ul>
<p>A good way to get these things sorted would be to pre-process queries with an appropriate analyzer.</p>
<br /><img alt="" border="0" src="http://feeds.wordpress.com/1.0/categories/findmeajob.wordpress.com/9/" /> <img alt="" border="0" src="http://feeds.wordpress.com/1.0/tags/findmeajob.wordpress.com/9/" /> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/findmeajob.wordpress.com/9/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/findmeajob.wordpress.com/9/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/findmeajob.wordpress.com/9/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/findmeajob.wordpress.com/9/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/findmeajob.wordpress.com/9/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/findmeajob.wordpress.com/9/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/findmeajob.wordpress.com/9/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/findmeajob.wordpress.com/9/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/findmeajob.wordpress.com/9/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/findmeajob.wordpress.com/9/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/findmeajob.wordpress.com/9/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/findmeajob.wordpress.com/9/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/findmeajob.wordpress.com/9/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/findmeajob.wordpress.com/9/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=findmeajob.wordpress.com&amp;blog=1344134&amp;post=9&amp;subd=findmeajob&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://findmeajob.wordpress.com/2007/08/08/looking-for-a-job-as-a-fashion-designer-for-an-importexport-company/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://1.gravatar.com/avatar/9bae1d7a91cd9e390a4d8d92c2636aa7?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">thedjinn</media:title>
		</media:content>
	</item>
		<item>
		<title>DateField Howto</title>
		<link>http://findmeajob.wordpress.com/2007/08/01/datefield-howto/</link>
		<comments>http://findmeajob.wordpress.com/2007/08/01/datefield-howto/#comments</comments>
		<pubDate>Wed, 01 Aug 2007 06:19:51 +0000</pubDate>
		<dc:creator>thedjinn</dc:creator>
				<category><![CDATA[apache]]></category>
		<category><![CDATA[fields]]></category>
		<category><![CDATA[index]]></category>
		<category><![CDATA[lucene]]></category>
		<category><![CDATA[search]]></category>
		<category><![CDATA[vertical search]]></category>

		<guid isPermaLink="false">http://findmeajob.wordpress.com/2007/08/01/datefield-howto/</guid>
		<description><![CDATA[Time is son of a bitch. More you think, more you realize, time is a constraint. Ever so true for search engines. Time is used to restrict query bounds. It is used often and frequently the way time is stored in indices is botched up. Frequently used way of storing date and time Date: 12-03-2007 [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=findmeajob.wordpress.com&amp;blog=1344134&amp;post=8&amp;subd=findmeajob&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p><img src='http://findmeajob.files.wordpress.com/2007/08/calendar.thumbnail.jpg?w=510' alt='calendar.jpg' /><br />
<strong>Time is son of a bitch</strong>. More you think, more you realize, time is a constraint. Ever so true for search engines. Time is used to restrict query bounds. It is used often and frequently the way time is stored in indices is botched up.</p>
<p><strong>Frequently used way of storing date and time</strong><br />
<em>Date</em>: 12-03-2007<br />
<em>Time</em>: 12:40:10<br />
It&#8217;s great from a viewing point of view, but from a search engine&#8217;s perspective it&#8217;s plain old stupid. The search engine would need to do a full identifier match throughout the index to find a particular date and time. Let&#8217;s assume a case of three dates.</p>
<ul>
<li>12-04-2007 22:00</li>
<li>12-03-2006 10:00</li>
<li>12-03-2007  22:00</li>
</ul>
<p>Now if the search query is looking for 12-03-2007 22:00, it will walk through all the fields to reach the last row. Something along the lines of:</p>
<ul>
<li><strong>12-04</strong>-2007 22:00 <em>not a match</em> </li>
<li><strong>12-03-2006</strong> 10:00 <em>not a match</em> </li>
<li><strong>12-03-2007  22:00</strong> <em>a match</em> </li>
</ul>
<p> Search engine walked about 33 characters on index to reach a conclusion that third row is a match.</p>
<p><strong>Magic of morphological ordering</strong><br />
By changing the date and time a little to something like YYYYMMDDHHMMSS we can get a fair bit of speed advantage. So above date and time would look like:</p>
<ul>
<li>200704122200</li>
<li>200603121000</li>
<li>200703122200</li>
</ul>
<p>Looking at number of operations for same query</p>
<ul>
<li><strong>200704</strong>122200 <em>not a match</em></li>
<li><strong>2006</strong>03121000 <em>not a match</em></li>
<li><strong>200703122200</strong>  <em>a match</em></li>
</ul>
<p>Search engine walked about 24 characters on index to reach a conclusion that third row is a match. If you notice, in case of second row it took 4 characters for search engine to conclude a mismatch. </p>
<p><strong>Range Query</strong><br />
A range query is a search query with bounds on the value. Let&#8217;s assume we need something between 12-03-2007 and 12-04-2007. With morphologically ordered date/time we convert the values in the index into integers and calculate if a row is between 20070312000000 and 20070412240000. This operation is <strong>many orders of magnitude</strong> simpler than doing a string match.</p>
<br /><img alt="" border="0" src="http://feeds.wordpress.com/1.0/categories/findmeajob.wordpress.com/8/" /> <img alt="" border="0" src="http://feeds.wordpress.com/1.0/tags/findmeajob.wordpress.com/8/" /> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/findmeajob.wordpress.com/8/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/findmeajob.wordpress.com/8/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/findmeajob.wordpress.com/8/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/findmeajob.wordpress.com/8/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/findmeajob.wordpress.com/8/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/findmeajob.wordpress.com/8/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/findmeajob.wordpress.com/8/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/findmeajob.wordpress.com/8/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/findmeajob.wordpress.com/8/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/findmeajob.wordpress.com/8/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/findmeajob.wordpress.com/8/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/findmeajob.wordpress.com/8/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/findmeajob.wordpress.com/8/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/findmeajob.wordpress.com/8/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=findmeajob.wordpress.com&amp;blog=1344134&amp;post=8&amp;subd=findmeajob&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://findmeajob.wordpress.com/2007/08/01/datefield-howto/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://1.gravatar.com/avatar/9bae1d7a91cd9e390a4d8d92c2636aa7?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">thedjinn</media:title>
		</media:content>

		<media:content url="http://findmeajob.files.wordpress.com/2007/08/calendar.thumbnail.jpg" medium="image">
			<media:title type="html">calendar.jpg</media:title>
		</media:content>
	</item>
		<item>
		<title>Lucene SingleSearcher vs. MultiSearcher</title>
		<link>http://findmeajob.wordpress.com/2007/07/31/lucene-singlesearcher-vs-multisearcher/</link>
		<comments>http://findmeajob.wordpress.com/2007/07/31/lucene-singlesearcher-vs-multisearcher/#comments</comments>
		<pubDate>Tue, 31 Jul 2007 12:47:31 +0000</pubDate>
		<dc:creator>thedjinn</dc:creator>
				<category><![CDATA[apache]]></category>
		<category><![CDATA[lucene]]></category>
		<category><![CDATA[search]]></category>
		<category><![CDATA[vertical search]]></category>
		<category><![CDATA[web2.0]]></category>

		<guid isPermaLink="false">http://findmeajob.wordpress.com/2007/07/31/lucene-singlesearcher-vs-multisearcher/</guid>
		<description><![CDATA[A design problem many sites deploying search engine would face, using SingleSearcher vs. MultiSearcher. Lucene gives access to search capability using a Searcher class. Searcher class accepts a query and returns list of Hits sorted by default by relevance. Searcher is an abstract class with possibility of wrangling up customized concrete Searcher. Two already available [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=findmeajob.wordpress.com&amp;blog=1344134&amp;post=6&amp;subd=findmeajob&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p><img src='http://findmeajob.files.wordpress.com/2007/07/test.jpg?w=150&#038;h=150' alt='Index' height="150" width="150" /><br />
A design problem many sites deploying search engine would face, using <strong>SingleSearcher vs. MultiSearcher</strong>. <strong>Lucene</strong> gives access to search capability using a <strong>Searcher</strong> class. Searcher class accepts a query and returns list of Hits sorted by default by relevance. Searcher is an abstract class with possibility of wrangling up customized concrete Searcher. Two already available Searcher classes are <strong>IndexSearcher</strong>  which loads an lucene index from disk and <strong>MultiSearcher</strong> which loads a list of lucene indices. MultiSearcher does an additional step of running merge sort after indices return the results.</p>
<p><strong>Why the question of IndexSearcher Vs. MultiSearcher</strong><br />
While pondering in a meeting room with nothing but an empty drawing board, it wouldn&#8217;t take much time for a design team to come to the conclusion that certain search criteria would be used more than others. Now the simple thing would be to make small manageable indices for those specific criteria and a separate index for general search. </p>
<p><strong>Why not to take this decision on outset</strong></p>
<ul>
<li>Lucene in default configuration is fast enough for most search requirements. Don&#8217;t use it as a premature optimization</li>
<li>It is not a good option for distributing indices over many disks. It&#8217;s easier to put disks in a RAID 0 configuration</li>
<li>It&#8217;s simpler to maintain a single index configuration</li>
<li>It involves extra cost of running a merge sort</li>
</ul>
<p>In some situations it makes sense to distribute indices because the frequency of a particular search criterion is too skewed. Still, in that case using many indices with a load balancer would be better. MultiSearcher does fulfill a certain niche, but it&#8217;s a premature optimization for most.</p>
<br /><img alt="" border="0" src="http://feeds.wordpress.com/1.0/categories/findmeajob.wordpress.com/6/" /> <img alt="" border="0" src="http://feeds.wordpress.com/1.0/tags/findmeajob.wordpress.com/6/" /> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/findmeajob.wordpress.com/6/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/findmeajob.wordpress.com/6/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/findmeajob.wordpress.com/6/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/findmeajob.wordpress.com/6/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/findmeajob.wordpress.com/6/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/findmeajob.wordpress.com/6/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/findmeajob.wordpress.com/6/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/findmeajob.wordpress.com/6/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/findmeajob.wordpress.com/6/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/findmeajob.wordpress.com/6/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/findmeajob.wordpress.com/6/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/findmeajob.wordpress.com/6/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/findmeajob.wordpress.com/6/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/findmeajob.wordpress.com/6/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=findmeajob.wordpress.com&amp;blog=1344134&amp;post=6&amp;subd=findmeajob&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://findmeajob.wordpress.com/2007/07/31/lucene-singlesearcher-vs-multisearcher/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://1.gravatar.com/avatar/9bae1d7a91cd9e390a4d8d92c2636aa7?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">thedjinn</media:title>
		</media:content>

		<media:content url="http://findmeajob.files.wordpress.com/2007/07/test.jpg" medium="image">
			<media:title type="html">Index</media:title>
		</media:content>
	</item>
		<item>
		<title>Search engine for job sites</title>
		<link>http://findmeajob.wordpress.com/2007/01/10/lucene-field-types-and-using-them-judiciously/</link>
		<comments>http://findmeajob.wordpress.com/2007/01/10/lucene-field-types-and-using-them-judiciously/#comments</comments>
		<pubDate>Wed, 10 Jan 2007 11:49:31 +0000</pubDate>
		<dc:creator>thedjinn</dc:creator>
				<category><![CDATA[apache]]></category>
		<category><![CDATA[fields]]></category>
		<category><![CDATA[index]]></category>
		<category><![CDATA[indices]]></category>
		<category><![CDATA[job]]></category>
		<category><![CDATA[lucene]]></category>
		<category><![CDATA[search]]></category>
		<category><![CDATA[vertical search]]></category>

		<guid isPermaLink="false">http://findmeajob.wordpress.com/2007/07/09/lucene-field-types-and-using-them-judiciously/</guid>
		<description><![CDATA[Most of you are probably familiar with 80/20 rule. The rule states that 80% of results come from 20% of causes. In job search this rule is even more extreme. A great search engine can quickly becomes addictive for a head-hunter. A smashing search engine for the portal can help grow the site so rapidly, [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=findmeajob.wordpress.com&amp;blog=1344134&amp;post=3&amp;subd=findmeajob&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>Most of you are probably familiar with the 80/20 rule. The rule states that 80% of results come from 20% of causes. In job search this rule is even more extreme.  A great search engine can quickly become addictive for a head-hunter. </p>
<p>A smashing search engine for the portal can help grow the site so rapidly, so its important to do everything to make search, from good enough to great. If you are starting out, you will need to do more to make an impact. </p>
<p><strong>What makes a good job search engine</strong><br />
Jobs search comes in all shapes and sizes but they share important qualities.</p>
<ul>
<li><em>Simple</em> The search engine needs to be simple to use. Complex forms are disturbing. The level of complexity could be viewed if required. Instead of bringing up 40 inputs in one go, a logical set of related fields could be made hidden or visible according to user input.
</li>
<li><em>Fast</em> The search data can become large, yet the engine must be able to sail through it to provide the relevant results. Faster search allows the user to run more searches and refine the search better.
</li>
<li><em>Saved Search</em> Being able to define a query and run it frequently is a great option. Many individuals look for same kind of profile over and over, looking up most relevant resumes.
</li>
<li><em>Sub-query</em>  Being able to refine a query and search through the set produced by a previous search. An individual, for example, searches for Java and, from the result set of that query, finds a person who also happens to be well versed with C++.
</li>
</ul>
<p><strong>Using Lucene</strong><br />
<a href="http://lucene.apache.org">Lucene</a> is open source search engine backend library. Lucene could be used for indexing GBs of data. </p>
<p><strong>Lucene Indexes</strong><br />
Lucene stores data in a search index. Lucene&#8217;s index is very similar to the &#8216;Index&#8217; section of a book. Let&#8217;s assume 4 documents containing various sets of words.  </p>
<p><strong>Normal index</strong><br />
Doc1 &#8211; Software Engineer, Java, C++<br />
Doc2 &#8211; Sales, Tele-Sales<br />
Doc3 &#8211; HR, Headhunting<br />
Doc4 &#8211; Sales, Manager</p>
<p><strong>Inverted Index</strong><br />
C++ &#8211; Doc1<br />
Headhunting &#8211; Doc3<br />
HR &#8211; Doc3<br />
Java &#8211; Doc1<br />
Manager &#8211; Doc4<br />
Sales &#8211; Doc2, Doc4<br />
Software Engineer &#8211; Doc1<br />
Tele-Sales &#8211; Doc2</p>
<p>Lucene uses inverted index which as you can see is easy to lookup for a word &#8216;Tele&#8217;. We can quickly work out Doc2 contains it. In normal index all documents would be needed to be read to get to same conclusion. Lucene indexes are FAST</p>
<p><strong>Storing data in indexes</strong><br />
While fast, indexes can get bogged down if they are not used correctly. Lucene indexes give five options for field type to store the search data</p>
<ul>
<li><em>String</em> field type is used for keyword identifiers. Most pertinent usage is for proper nouns which independently identify a context. Someones name, location, job profile.
</li>
<li><em>Numeric</em> field is bunch of field types. One could store them as text version of number. But best option is to convert into string numeric type. Doing this means, lucene changes the number into morphologically ordered text making querying fast.
</li>
<li><em>Date</em> field should be stored with DateField class, which converts date/time into YYYYMMDDHHMMSS form which speeds up morphological search and range queries.
</li>
<li><em>SortField</em> field is a tricky business. A good example of SortField is to use it when  search  requires sorting other than relevance based like date of resume posted.
</li>
<li><em>Text</em> field is where heart and soul of lucene rests. Text fields are just large unstructured text which could be  analyzed using various analysis sequences in lucene and indexed. This allows you to run full text query of these fields. What is of vital importance is to find analysis sequence which best suits your domain. If minimal analysis is used the index can become large and irrelevant, if its made to be too aggressive, it can leave blind spots on important search terms.
</li>
</ul>
<p>You can also set flags on fields which tells lucene how to treat the field. </p>
<ul>
<li><em>Stored</em> should be set to <em>True</em> in case a field needs to be displayed.
  </li>
<li><em>Indexed</em> should be set to <em>True</em> in case a field needs to be search-able.
    </li>
<li><em>Tokenized</em> should be set to <em>True</em> in case a field needs to go through analysis process before indexing</li>
<li><em>Compressed</em> should be set to <em>True</em> if the field needs to be compressed on disk. Lucene can search through compressed fields
</li>
</ul>
<p>Although it does not fulfill all the areas but Lucene provides a great starting point for a smashingly great search engine component for job search. </p>
<br /><img alt="" border="0" src="http://feeds.wordpress.com/1.0/categories/findmeajob.wordpress.com/3/" /> <img alt="" border="0" src="http://feeds.wordpress.com/1.0/tags/findmeajob.wordpress.com/3/" /> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/findmeajob.wordpress.com/3/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/findmeajob.wordpress.com/3/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/findmeajob.wordpress.com/3/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/findmeajob.wordpress.com/3/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/findmeajob.wordpress.com/3/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/findmeajob.wordpress.com/3/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/findmeajob.wordpress.com/3/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/findmeajob.wordpress.com/3/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/findmeajob.wordpress.com/3/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/findmeajob.wordpress.com/3/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/findmeajob.wordpress.com/3/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/findmeajob.wordpress.com/3/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/findmeajob.wordpress.com/3/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/findmeajob.wordpress.com/3/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=findmeajob.wordpress.com&amp;blog=1344134&amp;post=3&amp;subd=findmeajob&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://findmeajob.wordpress.com/2007/01/10/lucene-field-types-and-using-them-judiciously/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://1.gravatar.com/avatar/9bae1d7a91cd9e390a4d8d92c2636aa7?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">thedjinn</media:title>
		</media:content>
	</item>
	</channel>
</rss>
