LMPX.COM |
Home | Linux | Mysql | PHP | XML | ||
|
|
|||
From: Derick Rethans Date: Tue May 20 15:04:57 2008 Subject: cvs: presentations / search-phptek8.xml /slides/search crawling.xml ezcsearch-design-ideas.xml ezcsearch-document-definition.xml
ezcsearch-document-definition2.xml ezcsearch-index.xml ezcsearch-search.xml ezcsearch-search2.xml ezcsearch.xml future.xml h-introduction.xml
index.xml lucene.xml marjory.xml mysql-fulltext.xml mysql-restrictions.xml mysql-types.xml own-gotchas.xml own-gotchas2.xml own-gotchas3.xml
own-tokenization.xml own.xml performance.xml resources.xml searching.xml solr.xml stem-japanese.xml stemming.xml title.xml
tokenize-domain-specific.xml tokenize-japanese.xml tokenize.xml zend-lucene-index.xml zend-lucene-search.xml zend-lucene.xml
derick Tue May 20 21:04:57 2008 UTC
Added files:
/presentations search-phptek8.xml
/presentations/slides/search crawling.xml
ezcsearch-design-ideas.xml
ezcsearch-document-definition.xml
ezcsearch-document-definition2.xml
ezcsearch-index.xml
ezcsearch-search.xml
ezcsearch-search2.xml ezcsearch.xml
future.xml h-introduction.xml
index.xml lucene.xml marjory.xml
mysql-fulltext.xml
mysql-restrictions.xml mysql-types.xml
own-gotchas.xml own-gotchas2.xml
own-gotchas3.xml own-tokenization.xml
own.xml performance.xml resources.xml
searching.xml solr.xml
stem-japanese.xml stemming.xml
title.xml tokenize-domain-specific.xml
tokenize-japanese.xml tokenize.xml
zend-lucene-index.xml
zend-lucene-search.xml zend-lucene.xml
Log:
- Added search slide.
http://cvs.php.net/viewvc.cgi/presentations/search-phptek8.xml?view=markup&rev=1.1
Index: presentations/search-phptek8.xml
+++ presentations/search-phptek8.xml
<?xml version="1.0" encoding="utf-8"?>
<!-- Master file of the "Haystacks and Needles" talk: display settings,
     talk metadata and the ordered list of slide files. -->
<presentation template="css"
              navmode="html"
              navbarbackground="#4373b4"
              navbartopiclinks="0"
              navColor="#f1fbff"
              logo1=""
              stylesheet="presentations/slides/ezc/ez.css"
              backgroundfixed="1">
  <topic>Search</topic>
  <title>Haystacks and Needles</title>
  <event>php|tek</event>
  <location>Chicago, US</location>
  <date>May 22nd, 2008</date>
  <speaker>Derick Rethans</speaker>
  <email>dr@ez.no</email>
  <url>http://derickrethans.nl/talks.php</url>

  <slide>slides/search/title.xml</slide>

  <!-- Introduction -->
  <slide>slides/toolbox/me.xml</slide>
  <slide>slides/search/h-introduction.xml</slide>

  <!-- Index outline:
         * tokenizers
         * analysers (japanese)
         * stemming
         * stop words -->
  <slide>slides/search/index.xml</slide>
  <slide>slides/search/crawling.xml</slide>
  <slide>slides/search/tokenize.xml</slide>
  <slide>slides/search/tokenize-domain-specific.xml</slide>
  <slide>slides/search/tokenize-japanese.xml</slide>
  <slide>slides/search/stemming.xml</slide>
  <slide>slides/search/stem-japanese.xml</slide>

  <!-- Search outline:
         * boolean, operators, grouping
         * facets -->
  <slide>slides/search/searching.xml</slide>

  <!-- Methods.
       MySQL fulltext outline:
         * database tied
         * just "text"
         * only for myisam
         * distance between words does not matter (fuzzy in lucene/solr)
         * it's a database, not a search engine -->
  <slide>slides/search/mysql-fulltext.xml</slide>
  <slide>slides/search/mysql-types.xml</slide>
  <slide>slides/search/mysql-restrictions.xml</slide>

  <!-- Your own implementation outline:
         * tokenization -->
  <slide>slides/search/own.xml</slide>
  <slide>slides/search/own-tokenization.xml</slide>
  <slide>slides/search/own-gotchas.xml</slide>
  <slide>slides/search/own-gotchas2.xml</slide>
  <slide>slides/search/own-gotchas3.xml</slide>

  <!-- Lucene intro, then Zend Lucene -->
  <slide>slides/search/lucene.xml</slide>
  <slide>slides/search/zend-lucene.xml</slide>
  <slide>slides/search/zend-lucene-index.xml</slide>
  <slide>slides/search/zend-lucene-search.xml</slide>

  <!-- Solr intro outline:
         * java, webservice
         * marjory -->
  <slide>slides/search/solr.xml</slide>
  <slide>slides/search/marjory.xml</slide>

  <!-- ezc Search outline:
         * facets
         * datatypes
         * deals with objects
         * interface idea
         * extending with other analysers
         * datatype masks, issue with dates
         * interfaces to other backends: zend lucene, sphinx, google, xapian?
         * Indexing -> extractors -->
  <slide>slides/search/ezcsearch.xml</slide>
  <slide>slides/search/ezcsearch-design-ideas.xml</slide>
  <slide>slides/search/ezcsearch-document-definition.xml</slide>
  <slide>slides/search/ezcsearch-document-definition2.xml</slide>
  <slide>slides/search/ezcsearch-index.xml</slide>
  <slide>slides/search/ezcsearch-search.xml</slide>
  <slide>slides/search/ezcsearch-search2.xml</slide>

  <!-- Performance notes -->
  <slide>slides/search/performance.xml</slide>

  <!-- Future tech:
         * More Like This -->
  <slide>slides/search/future.xml</slide>
  <slide>slides/search/resources.xml</slide>
</presentation>
http://cvs.php.net/viewvc.cgi/presentations/slides/search/crawling.xml?view=markup&rev=1.1
Index: presentations/slides/search/crawling.xml
+++ presentations/slides/search/crawling.xml
<?xml version="1.0" encoding="utf-8"?>
<!-- Slide: sources a crawler can draw from and the document fields it should expose. -->
<slide>
  <title>Crawling</title>
  <break lines="3"/>
  <list>
    <bullet>Domain specific: file system, CMS, Google</bullet>
    <bullet>Should indicate different fields of a document: title, description, meta-tags, body</bullet>
  </list>
</slide>
http://cvs.php.net/viewvc.cgi/presentations/slides/search/ezcsearch-design-ideas.xml?view=markup&rev=1.1
Index: presentations/slides/search/ezcsearch-design-ideas.xml
+++ presentations/slides/search/ezcsearch-design-ideas.xml
<?xml version="1.0" encoding="utf-8"?>
<!-- Slide: requirements and design goals listed for the eZ Components Search component. -->
<slide>
  <title>eZ Components' Search component</title>
  <subtitle>Requirements and Design</subtitle>
  <list>
    <bullet>Support for multiple backends</bullet>
    <bullet>Abstract documents</bullet>
    <bullet>Support for datatypes</bullet>
    <bullet>Rich searching API, including facetted search</bullet>
    <bullet>Easy query interface</bullet>
  </list>
</slide>
http://cvs.php.net/viewvc.cgi/presentations/slides/search/ezcsearch-document-definition.xml?view=markup&rev=1.1
Index: presentations/slides/search/ezcsearch-document-definition.xml
+++ presentations/slides/search/ezcsearch-document-definition.xml
<?xml version="1.0" encoding="utf-8"?>
<!-- Slide: PHP example of a static getDefinition() method. It builds an
     ezcSearchDocumentDefinition for 'ezcSearchSimpleArticle', sets 'id' as
     the idProperty and registers the fields id, title, body, published, url
     and type. The example sits in CDATA, so its text is shown verbatim. -->
<slide>
<title>eZ Components' Search component</title>
<subtitle>Document Definition</subtitle>
<example><![CDATA[<?php
static public function getDefinition()
{
$n = new ezcSearchDocumentDefinition( 'ezcSearchSimpleArticle' );
$n->idProperty = 'id';
$n->fields['id'] =
new ezcSearchDefinitionDocumentField(
'id', ezcSearchDocumentDefinition::TEXT );
$n->fields['title'] =
new ezcSearchDefinitionDocumentField(
'title', ezcSearchDocumentDefinition::TEXT,
2, true, false, true );
$n->fields['body'] =
new ezcSearchDefinitionDocumentField(
'body', ezcSearchDocumentDefinition::TEXT,
1, false, false, true );
$n->fields['published'] =
new ezcSearchDefinitionDocumentField(
'published', ezcSearchDocumentDefinition::DATE );
$n->fields['url'] =
new ezcSearchDefinitionDocumentField(
'url', ezcSearchDocumentDefinition::STRING );
$n->fields['type'] =
new ezcSearchDefinitionDocumentField(
'type', ezcSearchDocumentDefinition::STRING,
0, true, false, false );
return $n;
}
?>]]></example>
</slide>
http://cvs.php.net/viewvc.cgi/presentations/slides/search/ezcsearch-document-definition2.xml?view=markup&rev=1.1
Index: presentations/slides/search/ezcsearch-document-definition2.xml
+++ presentations/slides/search/ezcsearch-document-definition2.xml
<?xml version="1.0" encoding="utf-8"?>
<!-- Slide: a document definition written as XML (fields id, title, summary,
     body, published, author), followed by a PHP snippet that creates an
     ezcSearchSession from an ezcSearchSolrHandler backend and an
     ezcSearchXmlManager. Both examples are CDATA and shown verbatim. -->
<slide>
<title>eZ Components' Search component</title>
<subtitle>Document Definition, in XML</subtitle>
<example><![CDATA[<?xml version="1.0"?>
<document>
<field type="id">id</field>
<field type="text" boost="2">title</field>
<field type="text">summary</field>
<field inResult="false" type="html">body</field>
<field type="date">published</field>
<field type="string" multi="true">author</field>
</document>]]></example>
<break lines="2"/>
<blurb>Setting up the manager:</blurb>
<example><![CDATA[<?php
$backend = new ezcSearchSolrHandler;
$session = new ezcSearchSession(
$backend,
new ezcSearchXmlManager( $testFilesDir )
);
?>]]></example>
</slide>
http://cvs.php.net/viewvc.cgi/presentations/slides/search/ezcsearch-index.xml?view=markup&rev=1.1
Index: presentations/slides/search/ezcsearch-index.xml
+++ presentations/slides/search/ezcsearch-index.xml
<?xml version="1.0" encoding="utf-8"?>
<!-- Slide: PHP example that creates an ezcSearchSession, constructs an
     Article object (id, title, description, body, published) and passes it
     to $session->index(). $backend and $testFilesDir are not defined in
     this snippet. -->
<slide>
<title>eZ Components' Search component</title>
<subtitle>Indexing</subtitle>
<example><![CDATA[<?php
$session = new ezcSearchSession(
$backend,
new ezcSearchXmlManager( $testFilesDir )
);
$a = new Article(
null, // id
'Test Article', // title
'This is an article to test', // description
'the body of the article', // body
time() // published
);
$session->index( $a );
?>]]></example>
</slide>
http://cvs.php.net/viewvc.cgi/presentations/slides/search/ezcsearch-search.xml?view=markup&rev=1.1
Index: presentations/slides/search/ezcsearch-search.xml
+++ presentations/slides/search/ezcsearch-search.xml
<?xml version="1.0" encoding="utf-8"?>
<!-- Slide: PHP example that builds a find query for 'Article' (title equals
     'Article', at most 5 results, ordered by id), runs it through the
     session and prints the title of every returned document.
     where(), limit() and orderBy() form one fluent call chain, so only the
     last call is terminated with a semicolon. -->
<slide>
<title>eZ Components' Search component</title>
<subtitle>Search - API</subtitle>
<example><![CDATA[<?php
$session = new ezcSearchSession(
$backend,
new ezcSearchXmlManager( $testFilesDir )
);
$q = $session->createFindQuery( 'Article' );
$q->where( $q->eq( 'title', 'Article' ) )
    ->limit( 5 )
    ->orderBy( 'id' );
$r = $session->find( $q );
foreach ( $r->documents as $document )
{
echo $document['document']->title, "\n";
}
?>]]></example>
</slide>
http://cvs.php.net/viewvc.cgi/presentations/slides/search/ezcsearch-search2.xml?view=markup&rev=1.1
Index: presentations/slides/search/ezcsearch-search2.xml
+++ presentations/slides/search/ezcsearch-search2.xml
<?xml version="1.0" encoding="utf-8"?>
<!-- Slide: PHP example that creates a find query for 'Article', hands it to
     an ezcSearchQueryBuilder together with the search string 'thunderball'
     and two field names, requests a facet on 'title', runs the query and
     prints each result document's title.
     NOTE(review): the ezcSearchQueryBuilder instance is not assigned to a
     variable; presumably its constructor modifies $q. Confirm against the
     component's API. -->
<slide>
<title>eZ Components' Search component</title>
<subtitle>Search - Query Builder</subtitle>
<example><![CDATA[<?php
$session = new ezcSearchSession(
$backend,
new ezcSearchXmlManager( $testFilesDir )
);
$q = $session->createFindQuery( 'Article' );
new ezcSearchQueryBuilder(
$q,
'thunderball',
array( 'fieldOne', 'fieldTwo' )
);
$q->facet( 'title' ); // keyword data field
$r = $session->find( $q );
foreach ( $r->documents as $document )
{
echo $document['document']->title, "\n";
}
?>]]></example>
</slide>
http://cvs.php.net/viewvc.cgi/presentations/slides/search/ezcsearch.xml?view=markup&rev=1.1
Index: presentations/slides/search/ezcsearch.xml
+++ presentations/slides/search/ezcsearch.xml
<?xml version="1.0" encoding="utf-8"?>
<!-- Slide: background on why the Search component was written. -->
<slide>
  <title>eZ Components' Search component</title>
  <list>
    <bullet>As our current solution doesn't scale, we needed something new</bullet>
    <bullet>As Lucene is really good, the first attempt was to use the Java bridge</bullet>
    <bullet>Then Solr came out</bullet>
    <bullet>Which we integrated in "eZ Find"</bullet>
    <bullet>Next versions of eZ Publish will use eZ Components</bullet>
  </list>
</slide>
http://cvs.php.net/viewvc.cgi/presentations/slides/search/future.xml?view=markup&rev=1.1
Index: presentations/slides/search/future.xml
+++ presentations/slides/search/future.xml
<?xml version="1.0" encoding="utf-8"?>
<!-- Slide: planned backends and features, and the release the component ships in. -->
<slide>
  <title>Future improvements</title>
  <list>
    <bullet>More backends: google, marjory, sphinx, xapian, yahoo</bullet>
    <bullet>More features: SpellChecker, MoreLikeThis</bullet>
    <bullet>Search component in the new eZ Components 2008.1 release</bullet>
  </list>
</slide>
http://cvs.php.net/viewvc.cgi/presentations/slides/search/h-introduction.xml?view=markup&rev=1.1
Index: presentations/slides/search/h-introduction.xml
+++ presentations/slides/search/h-introduction.xml
<?xml version="1.0" encoding="utf-8"?>
<!-- Section heading slide that uses the "title" template. -->
<slide template="title">
  <blurb class="title_blurb">Introduction</blurb>
</slide>
http://cvs.php.net/viewvc.cgi/presentations/slides/search/index.xml?view=markup&rev=1.1
Index: presentations/slides/search/index.xml
+++ presentations/slides/search/index.xml
<?xml version="1.0" encoding="utf-8"?>
<!-- Slide: the three steps indexing requires (crawl, tokenize, stem). -->
<slide>
  <title>Indexing</title>
  <break lines="3"/>
  <blurb>Before you can search, you need to index.</blurb>
  <break lines="3"/>
  <blurb>Indexing requires:</blurb>
  <list>
    <bullet>Finding the documents to index (crawl)</bullet>
    <bullet>Separate the documents into indexable units (tokenizing)</bullet>
    <bullet>Massage the found units (stemming)</bullet>
  </list>
</slide>
http://cvs.php.net/viewvc.cgi/presentations/slides/search/lucene.xml?view=markup&rev=1.1
Index: presentations/slides/search/lucene.xml
+++ presentations/slides/search/lucene.xml
<?xml version="1.0" encoding="utf-8"?>
<!-- Slide: description and feature list of Apache Lucene.
     The multi-line blurb text is kept unindented so that no extra
     whitespace ends up inside the text content. -->
<slide>
  <title>Apache Lucene</title>
  <blurb>Apache Lucene is a high-performance, full-featured text search
engine library written entirely in Java. It is a technology suitable for nearly
any application that requires full-text search, especially
cross-platform.</blurb>
  <break lines="2"/>
  <list>
    <bullet>Implemented in Java</bullet>
    <bullet>Provides indexing and searching libraries</bullet>
    <bullet>Ranked searching -- best results returned first</bullet>
    <bullet>Many powerful query types: phrase queries, wildcard queries, proximity queries, range queries and more</bullet>
    <bullet>Fielded searching (e.g., title, author, contents)</bullet>
    <bullet>Date-range searching</bullet>
    <bullet>Sorting by any field</bullet>
  </list>
</slide>
http://cvs.php.net/viewvc.cgi/presentations/slides/search/marjory.xml?view=markup&rev=1.1
Index: presentations/slides/search/marjory.xml
+++ presentations/slides/search/marjory.xml
<?xml version="1.0" encoding="utf-8"?>
<!-- Slide: what Marjory is and how it relates to Solr and Zend_Search_Lucene.
     The multi-line blurb text is kept unindented so that no extra
     whitespace ends up inside the text content. -->
<slide>
  <title>Marjory</title>
  <blurb>Marjory is a webservice for indexing and searching for documents,
utilizing a full-text search engine.</blurb>
  <list>
    <bullet>It is somewhat similar to Solr, but is written in PHP and the underlying architecture allows for using search engines other than Lucene (no other adaptor is implemented yet, though).</bullet>
    <bullet>Marjory is based on the Zend Framework and uses Zend_Search_Lucene as the default search engine. </bullet>
  </list>
</slide>
http://cvs.php.net/viewvc.cgi/presentations/slides/search/mysql-fulltext.xml?view=markup&rev=1.1
Index: presentations/slides/search/mysql-fulltext.xml
+++ presentations/slides/search/mysql-fulltext.xml
<?xml version="1.0" encoding="utf-8"?>
<!-- Slide: introduction to MySQL FULLTEXT indexes and where they can be used. -->
<slide>
  <title>MySQL FULLTEXT</title>
  <blurb>MySQL has support for full-text indexing and searching:</blurb>
  <list>
    <bullet>A full-text index in MySQL is an index of type FULLTEXT.</bullet>
    <bullet>Full-text indexes can be used only with MyISAM tables, and can be created only for CHAR, VARCHAR, or TEXT columns. </bullet>
  </list>
</slide>
http://cvs.php.net/viewvc.cgi/presentations/slides/search/mysql-restrictions.xml?view=markup&rev=1.1
Index: presentations/slides/search/mysql-restrictions.xml
+++ presentations/slides/search/mysql-restrictions.xml
<?xml version="1.0" encoding="utf-8"?>
<!-- Slide: restrictions of MySQL FULLTEXT search, quoted from the MySQL 5.0
     reference manual page named in the closing blurb. The Unicode bullet
     carries the full sentence from the manual: utf8 is usable, ucs2 is not. -->
<slide>
<title>MySQL FULLTEXT</title>
<list>
<bullet>Full-text searches are supported for MyISAM tables only.</bullet>
<bullet>Full-text searches can be used with most multi-byte character sets. The exception is that for Unicode, the utf8 character set can be used, but not the ucs2 character set.</bullet>
<bullet>Ideographic languages such as Chinese and Japanese do not have word delimiters. Therefore, the FULLTEXT parser cannot determine where words begin and end in these and other such languages.</bullet>
<bullet>Although the use of multiple character sets within a single table is supported, all columns in a FULLTEXT index must use the same character set and collation.</bullet>
</list>
<blurb class="quote">From http://dev.mysql.com/doc/refman/5.0/en/fulltext-restrictions.html</blurb>
</slide>
http://cvs.php.net/viewvc.cgi/presentations/slides/search/mysql-types.xml?view=markup&rev=1.1
Index: presentations/slides/search/mysql-types.xml
+++ presentations/slides/search/mysql-types.xml
<?xml version="1.0" encoding="utf-8"?>
<!-- Slide: two sample mysql client sessions against an "articles" table.
     The first uses MATCH ... AGAINST with IN BOOLEAN MODE, the second a
     MATCH ... AGAINST query without a mode modifier. Both sessions are in
     CDATA and shown verbatim, including the result tables. -->
<slide>
<title>MySQL FULLTEXT</title>
<subtitle>Types</subtitle>
<blurb>Boolean search:</blurb>
<break/>
<example><![CDATA[mysql> SELECT * FROM articles WHERE MATCH (title,body)
-> AGAINST ('+MySQL -YourSQL' IN BOOLEAN MODE);
+----+-----------------------+-------------------------------------+
| id | title | body |
+----+-----------------------+-------------------------------------+
| 1 | MySQL Tutorial | DBMS stands for DataBase ... |
| 2 | How To Use MySQL Well | After you went through a ... |
| 3 | Optimizing MySQL | In this tutorial we will show ... |
| 4 | 1001 MySQL Tricks | 1. Never run mysqld as root. 2. ... |
| 6 | MySQL Security | When configured properly, MySQL ... |
+----+-----------------------+-------------------------------------+
]]></example>
<break/>
<blurb>Natural search (with or without query expansion):</blurb>
<break/>
<example><![CDATA[mysql> SELECT * FROM articles
-> WHERE MATCH (title,body) AGAINST ('database');
+----+-------------------+------------------------------------------+
| id | title | body |
+----+-------------------+------------------------------------------+
| 5 | MySQL vs. YourSQL | In the following database comparison ... |
| 1 | MySQL Tutorial | DBMS stands for DataBase ... |
+----+-------------------+------------------------------------------+]]>
</example>
</slide>
http://cvs.php.net/viewvc.cgi/presentations/slides/search/own-gotchas.xml?view=markup&rev=1.1
Index: presentations/slides/search/own-gotchas.xml
+++ presentations/slides/search/own-gotchas.xml
<?xml version="1.0" encoding="utf-8"?>
<!-- Slide: PHP example that sets the locale to nb_NO.utf8 and splits a
     Norwegian UTF-8 string with preg_split() using the pattern /\W/u.
     The example is marked result='1'; presumably the presentation system
     runs it and shows the output (confirm against the slide renderer). -->
<slide>
<title>Own Implementation</title>
<blurb>Be careful to split with regular expressions - character set and
locale issues:</blurb>
<break/>
<example result='1'><![CDATA[<pre><?php
setlocale( LC_ALL, 'nb_NO.utf8');
var_dump( preg_split( '/\W/u', 'blårbærøl er greit' ) );
?>]]></example>
</slide>
http://cvs.php.net/viewvc.cgi/presentations/slides/search/own-gotchas2.xml?view=markup&rev=1.1
Index: presentations/slides/search/own-gotchas2.xml
+++ presentations/slides/search/own-gotchas2.xml
<?xml version="1.0" encoding="utf-8"?>
<!-- Slide: variant of the previous splitting example. The string is first
     converted from utf-8 to latin1 with iconv(), the locale is set to
     nb_NO.iso-8859-1 and preg_split() is called with /\W/ (no u modifier).
     The example is marked result='1'; presumably the presentation system
     runs it and shows the output (confirm against the slide renderer). -->
<slide>
<title>Own Implementation</title>
<blurb>Be careful to split with regular expressions - character set and
locale issues:</blurb>
<break/>
<example result='1'><![CDATA[<pre>
<?php
$string = 'blårbærøl er greit';
$string = iconv( 'utf-8', 'latin1', $string );
setlocale( LC_ALL, 'nb_NO.iso-8859-1');
var_dump( preg_split( '/\W/', $string ) );
?>]]></example>
</slide>
http://cvs.php.net/viewvc.cgi/presentations/slides/search/own-gotchas3.xml?view=markup&rev=1.1
Index: presentations/slides/search/own-gotchas3.xml
+++ presentations/slides/search/own-gotchas3.xml
<?xml version="1.0" encoding="utf-8"?>
<!-- Slide: sample mysql client session on database "ezno" showing the row
     counts of the ezsearch_word and ezsearch_object_word_link tables. -->
<slide>
<title>Own Implementation</title>
<blurb>Doesn't work very well for huge amounts of content:</blurb>
<break/>
<example><![CDATA[mysql> use ezno;
Database changed
mysql> select count(*) from ezsearch_word;
212291
mysql> select count(*) from ezsearch_object_word_link;
15551310
]]></example>
</slide>
http://cvs.php.net/viewvc.cgi/presentations/slides/search/own-tokenization.xml?view=markup&rev=1.1
Index: presentations/slides/search/own-tokenization.xml
+++ presentations/slides/search/own-tokenization.xml
<?xml version="1.0" encoding="utf-8"?>
<!-- Slide: how the home-grown implementation stores tokens, with a sample
     mysql session against the ezsearch_word and ezsearch_object_word_link
     tables. The session is held in CDATA, which yields the same text content
     as the plain character data it replaces. -->
<slide>
  <title>Own Implementation</title>
  <list>
    <bullet>Split text into tokens</bullet>
    <bullet>Store tokens in a table, uniquely - with frequency</bullet>
    <bullet>Store object / word location in a table, next/prev word, order</bullet>
  </list>
  <example><![CDATA[mysql> select * from ezsearch_word order by word limit 250, 4;
+------+--------------+--------------+
| id | object_count | word |
+------+--------------+--------------+
| 1761 | 1 | associations |
| 2191 | 1 | assurance |
| 349 | 37 | at |
+------+--------------+--------------+
mysql> select word_id, word from ezsearch_object_word_link wl, ezsearch_word w
where wl.word_id = w.id and contentobject_id = 145 order by placement limit 4;
+---------+--------+
| word_id | word |
+---------+--------+
| 1576 | puts |
| 926 | europe |
| 349 | at |
+---------+--------+
]]></example>
</slide>
http://cvs.php.net/viewvc.cgi/presentations/slides/search/own.xml?view=markup&rev=1.1
Index: presentations/slides/search/own.xml
+++ presentations/slides/search/own.xml
<?xml version="1.0" encoding="utf-8"?>
<!-- Slide: reasons given for writing an own search implementation. -->
<slide>
  <title>Own Implementation</title>
  <list>
    <bullet>We store content differently, so the FULLTEXT MySQL approach doesn't work</bullet>
    <bullet>We need support for CJK</bullet>
    <bullet>We needed support for other databases</bullet>
  </list>
</slide>
http://cvs.php.net/viewvc.cgi/presentations/slides/search/performance.xml?view=markup&rev=1.1
Index: presentations/slides/search/performance.xml
+++ presentations/slides/search/performance.xml
<?xml version="1.0" encoding="utf-8"?>
<!-- Slide: indexing speed comparison between Solr and Zend Lucene. -->
<slide>
  <title>Notes on Performance</title>
  <subtitle>Solr vs Zend Lucene</subtitle>
  <list>
    <bullet>Both engines are not tuned</bullet>
    <bullet>Solr indexes about 25% faster for small documents (one sentence)</bullet>
    <bullet>Solr indexes about 200% faster for big documents (64kb)</bullet>
  </list>
</slide>
http://cvs.php.net/viewvc.cgi/presentations/slides/search/resources.xml?view=markup&rev=1.1
Index: presentations/slides/search/resources.xml
+++ presentations/slides/search/resources.xml
<?xml version="1.0" encoding="utf-8"?>
<!-- Slide: links to further reading. Each link carries its label in the
     "leader" attribute. The file content is plain ASCII, so it is declared
     as utf-8 like the other slide files of this talk. -->
<slide fontsize="6em">
<title>Resources</title>
<break lines="6" />
<link leader="Porter algorithm: " href="http://telemat.det.unifi.it/book/2001/wchange/download/stem_porter.html" target="_new"/>
<link leader="Solr: " href="http://lucene.apache.org/solr/" target="_new"/>
<link leader="Zend Lucene: " href="http://framework.zend.com/manual/en/zend.search.lucene.html" target="_new"/>
<link leader="SnowBall: " href="http://snowball.tartarus.org/" target="_new"/>
<link leader="Mecab (In Japanese): " href="http://mecab.sourceforge.net/" target="_new"/>
<break/>
<link leader="These Slides: " href="http://derickrethans.nl/talks.php" target="_new"/>
</slide>
http://cvs.php.net/viewvc.cgi/presentations/slides/search/searching.xml?view=markup&rev=1.1
Index: presentations/slides/search/searching.xml
+++ presentations/slides/search/searching.xml
<?xml version="1.0" encoding="utf-8"?>
<!-- Slide: kinds of searches, ending with a link to a demo. The link markup
     is inside CDATA, so it reaches the application as text. -->
<slide>
  <title>Searching</title>
  <blurb>Different types of searches</blurb>
  <break/>
  <list>
    <bullet>Search words, phrases, boolean: %airplane, "red wine", wine -red%</bullet>
    <bullet>Field searches: %title:tutorial desc:feed%</bullet>
    <bullet>Facetted search:</bullet>
  </list>
  <blurb><![CDATA[<a href='http://dev.ezcomponents.org/search'>demo</a>]]></blurb>
</slide>
http://cvs.php.net/viewvc.cgi/presentations/slides/search/solr.xml?view=markup&rev=1.1
Index: presentations/slides/search/solr.xml
+++ presentations/slides/search/solr.xml
<?xml version="1.0" encoding="utf-8"?>
<!-- Slide: what Solr adds on top of Lucene, ending with a link to a local
     Solr admin demo. The link markup is inside CDATA, so it reaches the
     application as text. -->
<slide>
  <title>Apache Solr</title>
  <blurb>Solr is a standalone enterprise search server with a web-services like API. It extends Lucene:</blurb>
  <break/>
  <list>
    <bullet>Real schema, with numeric types, dynamic fields, unique keys</bullet>
    <bullet>Powerful extensions to the lucene query language</bullet>
    <bullet>Support for dynamic faceted browsing and filtering</bullet>
    <bullet>Advanced, configurable text analysis</bullet>
    <bullet>Highly configurable and user extensible caching</bullet>
    <bullet>Performance optimizations</bullet>
    <bullet>External configuration via xml</bullet>
    <bullet>An administration interface</bullet>
    <bullet>Monitorable logging</bullet>
    <bullet>Fast incremental updates and snapshot distribution</bullet>
    <bullet>XML and CSV/delimited-text update formats</bullet>
  </list>
  <blurb><![CDATA[<a href='http://localhost:8983/solr/admin/'>demo</a>]]></blurb>
</slide>
http://cvs.php.net/viewvc.cgi/presentations/slides/search/stem-japanese.xml?view=markup&rev=1.1
Index: presentations/slides/search/stem-japanese.xml
+++ presentations/slides/search/stem-japanese.xml
<?xml version="1.0" encoding="utf-8"?>
<!-- Slide: table of inflected forms of one Japanese verb. Each row gives
     the Japanese form, its romanisation and an English gloss. The example
     text is preformatted, so its line layout is part of the content. -->
<slide>
<title>Stemming</title>
<subtitle>Japanese</subtitle>
<example>
踊る odoru dance
踊らない odoranai doesn't dance
踊った odotta danced
踊らなかった odoranakatta didn't dance
踊れる odoreru can dance
踊れない odorenai can't dance
踊れた odoreta could dance
踊れなかった odorenakatta couldn't dance
踊っている odotteiru is dancing
踊っていない odotteinai isn't dancing
</example>
</slide>
http://cvs.php.net/viewvc.cgi/presentations/slides/search/stemming.xml?view=markup&rev=1.1
Index: presentations/slides/search/stemming.xml
+++ presentations/slides/search/stemming.xml
<?xml version="1.0" encoding="utf-8"?>
<!-- Slide: what stemming does, with sample word-to-stem mappings, followed
     by "sounds like" indexing with a soundex/metaphone comparison table.
     The example blocks are preformatted, so their line layout is part of
     the content. -->
<slide>
<title>Stemming</title>
<blurb>Stemming normalizes words:</blurb>
<list>
<bullet>Porter stemming</bullet>
<bullet>It's language dependent: snowball</bullet>
<bullet>Several algorithms exist</bullet>
</list>
<example>arrival -> arrive
skies -> sky
riding -> ride
rides -> ride
horses -> hors</example>
<break lines="2"/>
<blurb>Alternatively, instead of word analysis you can use "sounds like"
indexing, by using something like soundex or metaphone:</blurb>
<break/>
<example>
Word Soundex Metaphone
stemming S355 STMNK
peas P200 PS
peace P200 PS
please P420 PLS</example>
</slide>
http://cvs.php.net/viewvc.cgi/presentations/slides/search/title.xml?view=markup&rev=1.1
Index: presentations/slides/search/title.xml
+++ presentations/slides/search/title.xml
<?xml version="1.0" encoding="utf-8"?>
<!-- Title slide. The :-:name:-: tokens look like placeholders filled from
     the presentation metadata (assumption; confirm against the presentation
     system).
     NOTE(review): the presenter e-mail address is written out literally in
     the event-presenter blurb. -->
<slide template="title" fontsize="3.5em">
<blurb class="session-title">:-:title:-:</blurb>
<break/>
<blurb class="event-date">:-:event:-: - :-:location:-:</blurb>
<break/>
<blurb class="event-presenter">:-:speaker:-: - dr@ez.no</blurb>
<break/>
<blurb class="pres-url-small">:-:url:-:</blurb>
</slide>
http://cvs.php.net/viewvc.cgi/presentations/slides/search/tokenize-domain-specific.xml?view=markup&rev=1.1
Index: presentations/slides/search/tokenize-domain-specific.xml
+++ presentations/slides/search/tokenize-domain-specific.xml
<?xml version="1.0" encoding="utf-8"?>
<!-- Slide: domain-specific decisions to make when tokenizing text. -->
<slide>
  <title>Tokenizing</title>
  <subtitle>Domain specific</subtitle>
  <blurb>Tokenization is domain specific</blurb>
  <break/>
  <list>
    <bullet>You don't always want to split up letters from numbers - f.e. in product numbers.</bullet>
    <bullet>You might want to exclude words (stop words)</bullet>
    <bullet>You might want to filter out words that are short, or just long</bullet>
    <bullet>You might want to define synonyms</bullet>
    <bullet>You might want to normalize text (remove accents, Unicode forms)</bullet>
  </list>
</slide>
http://cvs.php.net/viewvc.cgi/presentations/slides/search/tokenize-japanese.xml?view=markup&rev=1.1
Index: presentations/slides/search/tokenize-japanese.xml
+++ presentations/slides/search/tokenize-japanese.xml
<?xml version="1.0" encoding="utf-8"?>
<!-- Slide: a sample Japanese sentence and the output the slide attributes
     to mecab for it, one token per line followed by comma-separated
     annotations. The example text is preformatted, so its line layout is
     part of the content. -->
<slide>
<title>Tokenizing</title>
<subtitle>Japanese</subtitle>
<blurb>There is little interpunction:</blurb>
<break/>
<blurb>辞書, コーパスに依存しない汎用的な設計</blurb>
<break/>
<blurb>You need special techniques to split it up into bits. Tools like
Kakasi and Mecab.</blurb>
<break/>
<blurb>Output from mecab:</blurb>
<example>
辞書, コーパスに依存しない汎用的な設計
辞書 名詞,普通名詞,*,*,辞書,じしょ,代表表記:辞書
, 特殊,記号,*,*,*,*,*
コーパス 名詞,普通名詞,*,*,*,*,*
に 助詞,格助詞,*,*,に,に,*
依存 名詞,サ変名詞,*,*,依存,いぞん,代表表記:依存
し 動詞,*,サ変動詞,基本連用形,する,し,付属動詞候補(基本) 代表表記:する
ない 接尾辞,形容詞性述語接尾辞,イ形容詞アウオ段,基本形,ない,ない,*
汎用 名詞,サ変名詞,*,*,汎用,はんよう,代表表記:汎用
的な 接尾辞,形容詞性名詞接尾辞,ナ形容詞,ダ列基本連体形,的だ,てきな,*
設計 名詞,サ変名詞,*,*,設計,せっけい,代表表記:設計
</example>
</slide>
http://cvs.php.net/viewvc.cgi/presentations/slides/search/tokenize.xml?view=markup&rev=1.1
Index: presentations/slides/search/tokenize.xml
+++ presentations/slides/search/tokenize.xml
<?xml version="1.0" encoding="utf-8"?>
<!-- Slide: tokenizing plain text (by whitespace or by continuous letters)
     and tokenizing HTML. The HTML sample is wrapped in CDATA so that its
     tags are delivered as literal text rather than parsed as child elements
     of the example. -->
<slide>
<title>Tokenizing</title>
<blurb>Making indexing parts out of text.</blurb>
<break/>
<blurb>Text</blurb>
<example>"This standard was developed from ISO/IEC 9075:1989"
Whitespace:
"This" "standard" "was" "developed" "from" "ISO/IEC" "9075:1989"
Continuous letters:
"This" "standard" "was" "developed" "from" "ISO" "IEC"
</example>
<break/>
<blurb>HTML</blurb>
<example><![CDATA["<li><em>If it exists</em>, the STATUS of the W3C document.</li>"
"If" "it" "exists" "the" "status" "of" "the" "w3c" "document"
]]></example>
</slide>
http://cvs.php.net/viewvc.cgi/presentations/slides/search/zend-lucene-index.xml?view=markup&rev=1.1
Index: presentations/slides/search/zend-lucene-index.xml
+++ presentations/slides/search/zend-lucene-index.xml
<?xml version="1.0" encoding="utf-8"?>
<!-- Slide: two PHP indexing examples for Zend_Search_Lucene. The first
     opens an existing index, builds a document with a Text 'url' field and
     an UnStored 'contents' field and adds it to the index. The second loads
     a document with Zend_Search_Lucene_Document_Html::loadHTMLFile() and
     adds it; it uses $index without defining it in that snippet. -->
<slide>
<title>Zend Lucene</title>
<subtitle>Indexing Example</subtitle>
<blurb>Normal:</blurb>
<example><![CDATA[<?php
// Open existing index
$index = Zend_Search_Lucene::open('/data/my-index');
$doc = new Zend_Search_Lucene_Document();
// Store document URL to identify it in search result.
$doc->addField(Zend_Search_Lucene_Field::Text('url', $docUrl));
// Index document content
$doc->addField(Zend_Search_Lucene_Field::UnStored('contents', $docContent));
// Add document to the index.
$index->addDocument($doc);
?>]]></example>
<break/>
<blurb>HTML document:</blurb>
<example><![CDATA[<?php
$doc = Zend_Search_Lucene_Document_Html::loadHTMLFile($filename);
$index->addDocument($doc);
?>]]></example>
</slide>
http://cvs.php.net/viewvc.cgi/presentations/slides/search/zend-lucene-search.xml?view=markup&rev=1.1
Index: presentations/slides/search/zend-lucene-search.xml
+++ presentations/slides/search/zend-lucene-search.xml
<?xml version="1.0" encoding="utf-8"?>
<!-- Slide: two PHP search examples for Zend_Search_Lucene. The first parses
     a user query string and passes the parsed query to find(). The second
     combines the parsed user query with a term query on the 'path' field in
     a boolean query and searches with that; it uses $index without defining
     it in that snippet. -->
<slide>
<title>Zend Lucene</title>
<subtitle>Search Example</subtitle>
<blurb>With parser:</blurb>
<example><![CDATA[<?php
$index = Zend_Search_Lucene::open('/data/my-index');
$userQuery = Zend_Search_Lucene_Search_QueryParser::parse($queryStr);
$hits = $index->find($userQuery);
?>]]></example>
<break/>
<blurb>With API:</blurb>
<example><![CDATA[<?php
$userQuery = Zend_Search_Lucene_Search_QueryParser::parse($queryStr);
$pathTerm = new Zend_Search_Lucene_Index_Term('/data/doc_dir/' . $filename, 'path');
$pathQuery = new Zend_Search_Lucene_Search_Query_Term($pathTerm);
$query = new Zend_Search_Lucene_Search_Query_Boolean();
$query->addSubquery($userQuery);
$query->addSubquery($pathQuery);
$hits = $index->find($query);
?>]]></example>
</slide>
http://cvs.php.net/viewvc.cgi/presentations/slides/search/zend-lucene.xml?view=markup&rev=1.1
Index: presentations/slides/search/zend-lucene.xml
+++ presentations/slides/search/zend-lucene.xml
<?xml version="1.0" encoding="utf-8"?>
<!-- Slide: overview of Zend Lucene. The first list gives its capabilities,
     the second list the supported field types. -->
<slide>
  <title>Zend Lucene</title>
  <blurb>It's a port of Java Lucene to PHP</blurb>
  <break lines="2"/>
  <list>
    <bullet>Compatible with the Lucene index format</bullet>
    <bullet>Provides indexing and searching libraries</bullet>
    <bullet>Supports some of the lucene query language</bullet>
    <bullet>Support for indexing HTML documents: title, meta and body</bullet>
    <bullet>Has support for different field types:</bullet>
  </list>
  <list>
    <bullet>*Keyword*: Not tokenized</bullet>
    <bullet>*UnIndexed*: Not indexed</bullet>
    <bullet>*Binary*: Binary data</bullet>
    <bullet>*Text*: Tokenized</bullet>
    <bullet>*UnStored*: Tokenized, but only indexed</bullet>
  </list>
</slide>
| Navigate in group php.pres at server news.php.net | |
| Previous | Next |
| No Copyright You are free to use Anything |
Site Maintained by PHP Developer
Powered By PHP Consultants |