ElasticSearch lässt sich auch als Backend/Index für die eigene Website nutzen. Dazu müssen ihr nur alle Inhalte bekannt gemacht werden.
elasticsearch
In Debian und Ubuntu ganz einfach mit:
Bisher liegen alle Inhalte nur als Markdown- und HTML-Dateien vor. In der Ausgabe der Suche soll aber keinerlei Formatierung erscheinen. Also auch keine HTML-Tags.
Daher werden die Markdown-Files nochmals konvertiert, diesmal nach PLAIN, also unformatiertem Text, z.B. mittels `pandoc -f markdown -t plain -o seite.plain seite.md`.
Statt alle `*.md`-Files einzeln zu konvertieren, übernimmt dies das Makefile aus „Pandoc als Wiki-Engine“ nach Ergänzung des Targets `plain`:
Makefile
:
# Build the static wiki with pandoc:
#   *.md -> *.html   (target "html")
#   *.md -> *.plain  (target "plain", input for the ElasticSearch indexer)
#   sitemap.xml      (target "sitemap")
# NOTE: every recipe line below MUST start with a TAB character.

.SILENT:

# Input discovery; ':=' evaluates these once, when make reads the file.
INDIRS := $(shell find * -type d | sort)
INMD := $(shell find * -type f -name "*.md")
INSTATIC := $(shell find * -type f \( -name "*.js" -o -name "*.css" -o -name "*.pdf" -o -name "*.jpg" -o -name "*.png" \))

# INHTML and SITEMAP are deliberately defined with '=' (deferred expansion):
# they are only expanded when the "sitemap" recipe runs, i.e. AFTER its
# prerequisite "html" has generated the *.html files. With ':=' a clean
# first build would produce an empty sitemap.
INHTML = $(shell find * -type f -name "*.html")
SITEMAPXML := sitemap.xml
# File names are passed to printf as data (%s), never as the format string.
SITEMAP = '<?xml version="1.0" encoding="UTF-8"?>\
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\
$(shell for _in in ${INHTML} ; do printf "<url><loc>https://wiki.failover.de/%s</loc></url>" "$${_in}"; done)\
</urlset>'

TIDY := tidy -xml -utf8 -q
PANDOC := pandoc --toc -f markdown -t html5 --template template.html -s --highlight-style kate
PANPLAIN:= pandoc -f markdown -t plain

.PHONY: main html plain sitemap upload help

main: html sitemap

# Render every Markdown file (plus its sibling .yaml metadata) to HTML.
html:
	$(info $@:)
	for _in in ${INMD}; do \
		${PANDOC} -o $${_in%.md}.html $$_in $${_in%.md}.yaml; \
	done

# Render every Markdown file to unformatted text for the search index.
plain:
	$(info $@:)
	for _in in ${INMD}; do \
		${PANPLAIN} -o $${_in%.md}.plain $$_in $${_in%.md}.yaml; \
	done

# Write the sitemap; '%s' keeps '%' and '\' in file names from being
# interpreted by printf.
sitemap: html
	$(info $@:)
	$(info ${SITEMAPXML})
	printf '%s' ${SITEMAP} | ${TIDY} - > ${SITEMAPXML}

upload:
	rsync -auv --delete ./* yourhost.com:/var/www/html/your_path/

help :
	echo ""
	echo "make (main) - builds all in current directory"
	echo "make html - creates all html files"
	echo "make sitemap - creates a google sitemap: ${SITEMAPXML}"
	echo "make plain - creates all *.plain files for elasticsearch"
	echo "make upload - syncs the current directory to the web server"
	echo "make help - this info"
	echo ""
Ein kleines Perl-Skript übernimmt das Crawling, hangelt sich durch den Verzeichnisbaum und füttert ElasticSearch mit allen `*.plain`- und den dazugehörigen `*.yaml`-Dateien.
indexer.pl
#!/usr/bin/perl
# indexer.pl - walk the wiki directory tree and feed every *.plain file,
# together with the metadata of its sibling *.yaml file, into ElasticSearch.
use strict;
use warnings;
use Search::Elasticsearch;
use File::Find;
use Sysadm::Install qw( slurp );
use open ':locale';
use YAML::XS 'LoadFile';# DumpFile/;

# --- configuration ---------------------------------------------------------
my $idx            = "wiki";                       # name of the ES index
my( $base )        = "/your_path_to/pandoc/wiki";  # absolute root of the wiki tree
my $domain         = "your_domain";                # host part of the public URLs
my $author_default = "Max Mustermann";             # used when the YAML has no author

# $base without trailing slashes, so URL mapping works for both spellings.
( my $base_dir = $base ) =~ s{/+\z}{};

my $es = Search::Elasticsearch->new( trace_to => ['File','log'] );

# Always start from an empty index. Deleting is best effort: it fails
# harmlessly when the index does not exist yet (first run).
eval { $es->indices->delete( index => $idx ) };
eval { $es->indices->create( index => $idx ); 1 }
    or warn "Could not create index '$idx': $@";

find sub {
    # $base is absolute, so the full name stays valid although File::Find
    # changes into each directory while it walks the tree.
    my $file = $File::Find::name;

    return unless $file =~ /\.plain\z/;   # only pandoc's plain-text output
    return unless -f $file;
    return if ! -T $file;                 # skip anything that looks binary
    return if -s $file > 100_000;         # skip very large files

    my $content = slurp $file, {utf8 => 0};

    # Metadata lives next to the text file: foo.plain -> foo.yaml.
    # A missing or non-mapping YAML file must not abort the whole crawl;
    # the defaults below are used instead.
    ( my $yaml = $file ) =~ s/\.plain\z/.yaml/;
    my $config = -f $yaml ? LoadFile($yaml) : undef;
    $config = {} unless ref $config eq 'HASH';

    # Map the directory to its public URL: <base>/pki -> https://<domain>/pki/
    # \Q...\E keeps regex metacharacters in the path from being interpreted.
    my $url = $File::Find::dir;
    $url =~ s{\A\Q$base_dir\E/*}{https://$domain/};
    $url .= '/' unless $url =~ m{/\z};

    $es->index(
        index => $idx,
        type  => 'text',
        body  => {
            content  => $content,
            url      => $url,
            author   => $config->{author}   || $author_default,
            keywords => $config->{keywords} || "No keywords found",
            abstract => $config->{abstract} || "No abstract found",
            date     => $config->{date}     || "1970-01-01",
            title    => $config->{title}    || "No title found",
        }
    );
    print "Added $file\n";
}, $base;
Da `Search::Elasticsearch` in Debian nicht als Paket vorliegt, kann man es entweder mit `cpan` installieren oder aber besser gleich selbst ein `libsearch-elasticsearch-perl`-Paket bauen und installieren:
# Build a Debian package for Log::Any from CPAN and install it
# (presumably required as a dependency of Search::Elasticsearch - verify).
dh-make-perl --cpan Log::Any --build-source --build
sudo dpkg -i liblog-any-perl*.deb
# Build and install the Search::Elasticsearch package the same way.
dh-make-perl --cpan Search::Elasticsearch --build-source --build
sudo dpkg -i libsearch-elasticsearch-perl*.deb
Anschließend lässt sich der Index anlegen:
In „Pandoc als Wiki-Engine“ wurden bereits einige Variablen statt in den `*.md`-Files in extra `*.yaml`-Dateien definiert. Dies ist Voraussetzung für das Funktionieren des Perl-Skripts, das in ElasticSearch einige Felder anlegt und diese mit den aus den `*.yaml`-Dateien gewonnenen Werten belegt.
Die einfachste Form der Suche ist, `GET`-Anfragen mit cURL direkt an ElasticSearch zu schicken:
curl -i 'localhost:9200/wiki/_search?q=wiki&size=1&from=0&pretty=1';echo
HTTP/1.1 200 OK
Content-Type: application/json; charset=UTF-8
Content-Length: 532
{
"took" : 27,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"failed" : 0
},
"hits" : {
"total" : 55,
"max_score" : 0.3057859,
"hits" : [ {
"_index" : "wiki",
"_type" : "text",
"_id" : "XQ2Q5nPoSVai9U5e8zHv2Q",
"_score" : 0.3057859, "_source" : {"date":"2011-02-03","keywords":"pki","url":"https://wiki.failover.de/pki/","title":"PKI - Failover::Wiki","author":"Michael Mende","content":"PKI\n===\n\n \n\n","abstract":"No abstract found"}
} ]
}
}
Da ElasticSearch eine ungesicherte REST-Schnittstelle hat, sollte ihr ein Proxy vorgeschaltet werden, der alle Nicht-`GET`-Anfragen aussortiert und auch gleich die `GET`-Anfragen limitiert:
...
location /search {
    disable_symlinks on;

    # Read-only access: everything except HEAD and GET is rejected.
    if ($request_method !~ ^(HEAD|GET)$ ) {
        return 405;
    }

    # Defaults: start at data set #0, return 5 data sets, empty search term.
    set $from 0;
    set $size 5;
    set $query '';

    # Take "from" from the client only if it is a number.
    if ($arg_from ~ ^\d+$) {
        set $from $arg_from;
    }
    # The client may only lower the number of data sets (1-4).
    if ($arg_size ~ ^[1-4]$) {
        set $size $arg_size;
    }
    # Search term as sent by the client.
    set $query $arg_query;

    # Drop the original GET parameters; only the ones built below reach
    # ElasticSearch.
    set $args '';

    # Fixed parameters: human readable output, the fields to return, and
    # AND as operator between terms ('foo' AND 'bar').
    set $pretty 1;
    set $fields 'author,date,abstract,keywords,url,title';
    set $default_operator 'AND';

    rewrite ^(.*)$ /wiki/_search?pretty=$pretty&default_operator=$default_operator&fields=$fields&size=$size&from=$from&q=$query break;

    include /etc/nginx/proxy_params;
    proxy_pass http://localhost:9200;
    proxy_redirect http://$host:9200/ /;
}
...
curl -i 'localhost:80/search?query=wiki&size=1&from=0&pretty=1';echo
HTTP/1.1 200 OK
Server: nginx
Date: Tue, 23 Jun 2015 11:14:27 GMT
Content-Type: application/json; charset=UTF-8
Content-Length: 603
Connection: keep-alive
{
"took" : 22,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"failed" : 0
},
"hits" : {
"total" : 55,
"max_score" : 0.3057859,
"hits" : [ {
"_index" : "wiki",
"_type" : "text",
"_id" : "XQ2Q5nPoSVai9U5e8zHv2Q",
"_score" : 0.3057859,
"fields" : {
"abstract" : [ "No abstract found" ],
"author" : [ "Michael Mende" ],
"title" : [ "PKI - Failover::Wiki" ],
"keywords" : [ "pki" ],
"date" : [ "2011-02-03" ],
"url" : [ "https://wiki.failover.de/pki/" ]
}
} ]
}
}
test.html
<!DOCTYPE html>
<html>
  <head>
    <meta charset="utf-8" />
    <meta name="viewport" content="width=device-width,initial-scale=1" />
    <!-- jQuery is loaded via HTTPS so that it is not blocked as mixed content
         when the page itself is served over HTTPS -->
    <script src="https://ajax.googleapis.com/ajax/libs/jquery/1.11.3/jquery.min.js"></script>
    <!-- Latest compiled and minified JavaScript -->
    <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.5/js/bootstrap.min.js"></script>
    <!-- Latest compiled and minified CSS -->
    <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.5/css/bootstrap.min.css">
    <!-- Optional theme -->
    <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.5/css/bootstrap-theme.min.css">
    <!-- Search client; expects #search_form, #search_input and #search_result -->
    <script src="/search.js"></script>
  </head>
  <body>
    ...
    <form id="search_form" class="navbar-form pull-left" role="search" accept-charset="utf-8">
      <div class="input-group">
        <!-- exactly one id per element: search.js reads #search_input -->
        <input id="search_input" type="text" class="form-control" placeholder="Search" />
        <div class="input-group-btn" >
          <button id="search_button" type="submit" class="btn btn-default" >
            <span class="glyphicon glyphicon-search"></span>
          </button>
        </div>
      </div>
    </form>
    ...
    <!-- search.js renders hits and pagination into this container -->
    <div class="container" id="search_result"></div>
    <div class="container">
      ...
    </div>
  </body>
</html>
search.js
// Number of hits requested per page. The NginX proxy only accepts size values
// 1-4 from the client and otherwise uses its own default of 5, so this value
// has to stay equal to that default for the pagination arithmetic to match.
var query_size=5;
/**
 * Reset the search UI: remove everything rendered into #search_result
 * and blank the #search_input text field.
 */
function clear_results() {
  var resultBox = $("#search_result");
  var inputBox = $("#search_input");

  resultBox.empty();
  inputBox.val("");
}
/**
 * Build the request URL for the /search proxy.
 *
 * @param {string} query      search term as typed by the user
 * @param {number} query_from index of the first hit to return
 * @returns {string} relative URL including query, size and from
 *
 * The term is URL-encoded so that characters such as '&', '#', '+' or '='
 * stay part of the term and cannot add or cut off query parameters.
 */
function set_url(query,query_from) {
  return "/search?query=" + encodeURIComponent(query) +
         "&size=" + query_size +
         "&from=" + query_from;
}
/**
 * Run a search and render the hits plus a pagination bar into #search_result.
 *
 * @param {string} query      search term
 * @param {number} query_from index of the first hit to show (multiple of query_size)
 *
 * Everything that originates from the user or from the index (search term,
 * url, title, keywords, abstract, author, date) is HTML-escaped before it is
 * inserted into the page, so markup or quotes in these values can neither
 * break the layout nor inject script.
 */
function update_results(query,query_from) {
  // Escape a value for use in HTML text and in quoted attribute values.
  function esc(value) {
    return String(value)
      .replace(/&/g, "&amp;")
      .replace(/</g, "&lt;")
      .replace(/>/g, "&gt;")
      .replace(/"/g, "&quot;")
      .replace(/'/g, "&#39;");
  }

  // Attribute-safe onclick code that loads the page starting at `from`.
  // JSON.stringify turns the term into a valid JavaScript string literal,
  // esc() then makes that code safe inside the single-quoted attribute.
  function page_call(from) {
    return esc("update_results(" + JSON.stringify(query) + "," + from + ");");
  }

  var close_link = "<a title='Remove Search Result' href='#' onclick='clear_results();'><span class='pull-right glyphicon glyphicon-remove-circle'></span></a>";

  $.getJSON(set_url(query,query_from), function(result){
    clear_results();
    var html="";

    if (result.hits.total == 0) {
      html = "<div class='alert alert-danger' role='alert'><span class='glyphicon glyphicon-exclamation-sign' aria-hidden='true'></span><span class='sr-only'>Error:</span> Nothing found for search item: <em>"+esc(query)+"</em>."+close_link+"</div>";
      $("#search_result").append(html);
      return true;
    }

    html = "<div class='alert alert-success' role='alert'>" + esc(result.hits.total) + " hits for search item: <em>"+esc(query)+"</em>."+close_link+"</div>";
    $("#search_result").append(html);

    // One entry per hit: linked title, keywords / abstract, author (date).
    $.each(result.hits.hits, function(i, field){
      html = "<div>";
      html+= "<div><big><a href='" + esc(field.fields.url) + "'>" + esc(field.fields.title) + "</a></big></div>";
      html+= "<div>" + esc(field.fields.keywords) + " / " + esc(field.fields.abstract) + "</div>";
      html+= "<div><small>" + esc(field.fields.author) + " (" + esc(field.fields.date) + ")</small></div>";
      html+= "</div><br/>";
      $("#search_result").append(html);
    });

    // Pagination: previous, one entry per page, next.
    var pages=Math.ceil(result.hits.total/query_size);
    html = "<nav><ul class='pagination pagination-sm'>";
    if (query_from >= query_size) {
      html+= "<li><a href='#' aria-label='Previous' onclick='"+page_call(query_from-query_size)+"'><span aria-hidden='true'>«</span></a></li>";
    } else {
      html+= "<li class='disabled'><a href='#' aria-label='Previous'><span aria-hidden='true'>«</span></a></li>";
    }
    for (var k=0; k<pages;k++) {
      if (k*query_size == query_from) {
        html+="<li class='active'><a href='#'>"+(k+1)+"</a></li>";
      } else {
        html+="<li><a href='#' onclick='"+page_call(k*query_size)+"'>"+(k+1)+"</a></li>";
      }
    }
    if (query_from < result.hits.total-query_size) {
      html+= "<li><a href='#' aria-label='Next' onclick='"+page_call(query_from+query_size)+"'><span aria-hidden='true'>»</span></a></li>";
    } else {
      html+= "<li class='disabled'><a href='#' aria-label='Next'><span aria-hidden='true'>»</span></a></li>";
    }
    html+="</ul></nav>";
    $("#search_result").append(html);
  });
}
// Once the DOM is ready, route submits of the search form to the AJAX search
// instead of a normal page load.
$(function () {
  function on_search_submit() {
    var term = $("#search_input").val();
    update_results(term, 0);
    // Returning false suppresses the browser's default form submit.
    return false;
  }

  $("#search_form").on("submit", on_search_submit);
});