Joshua is an open-source machine translation (MT) system developed at Johns Hopkins University. It uses a hierarchical phrase-based translation model. What follows are step-by-step instructions. The list may look long at first glance, but it should make it straightforward to build a machine translation system and all its components, and it should make the process of tuning, testing, and evaluating that system transparent.
These instructions are adapted from Chris Callison-Burch's Joshua guide. More instructions and documentation for the use of Thrax, the translation model extractor, can be found on its GitHub wiki.
If you have problems running this pipeline, please email jonny at cs dot jhu dot edu. Please mention "WMT11 baseline" in your subject line.
export SRILM=/path/to/srilm
export JAVA_HOME=/Library/Java/Home  # on OS X; the path differs on other operating systems
tar xzf joshua.tar.gz
cd joshua
ant
wget http://apache.cs.utah.edu//hadoop/core/hadoop-0.20.2/hadoop-0.20.2.tar.gz
tar -xzf hadoop-0.20.2.tar.gz
wget http://ds60ft5bv5jal.cloudfront.net/aws-java-sdk-1.1.3.zip
unzip aws-java-sdk-1.1.3.zip
export HADOOP=/path/to/hadoop
export AWS_SDK=/path/to/aws/sdk
git clone https://github.com/jweese/thrax.git
(cd thrax && ant)
export THRAX=/path/to/thrax
tar xzf scripts.tgz
scripts/tokenizer.perl
scripts/lowercase.perl
scripts/wrap-xml.perl
mkdir -p working-dir/corpus
scripts/tokenizer.perl -l fr < wmt08/training/europarl-v3.fr-en.fr > working-dir/corpus/europarl.tok.fr
scripts/tokenizer.perl -l en < wmt08/training/europarl-v3.fr-en.en > working-dir/corpus/europarl.tok.en
scripts/lowercase.perl < working-dir/corpus/europarl.tok.fr > working-dir/corpus/europarl.lowercased.fr
scripts/lowercase.perl < working-dir/corpus/europarl.tok.en > working-dir/corpus/europarl.lowercased.en
word-align.conf
(example here)
mkdir -p example/test
java -d64 -Xmx10g -jar /path/to/aligner/berkeleyaligner.jar ++word-align.conf
cp working-dir/alignments/europarl.align working-dir/corpus/europarl.fr-en.alignments
mkdir -p working-dir/lm
scripts/tokenizer.perl -l en < wmt08/training/europarl-v3.en > working-dir/lm/europarl.tok
scripts/lowercase.perl < working-dir/lm/europarl.tok > working-dir/lm/europarl.lowercased
$SRILM/bin/i686/ngram-count -order 5 -interpolate -kndiscount -text working-dir/lm/europarl.lowercased -lm working-dir/lm/europarl.lm
paste working-dir/corpus/europarl.lowercased.fr working-dir/corpus/europarl.lowercased.en working-dir/corpus/europarl.fr-en.alignments | perl -pe 's/\t/ ||| /g' >working-dir/corpus/europarl.unified
hadoop jar $THRAX/bin/thrax.jar thrax.conf europarl
hadoop fs -getmerge europarl working-dir/corpus/grammar
$THRAX/scripts/create_glue_grammar.sh thrax.conf <working-dir/corpus/grammar >working-dir/corpus/glue.grammar
mkdir -p working-dir/tuning
scripts/tokenizer.perl -l fr < wmt08/dev/dev2006.fr > working-dir/tuning/input.tok
scripts/tokenizer.perl -l en < wmt08/dev/dev2006.en > working-dir/tuning/reference.tok
scripts/lowercase.perl < working-dir/tuning/input.tok > working-dir/tuning/input
scripts/lowercase.perl < working-dir/tuning/reference.tok > working-dir/tuning/reference
$THRAX/scripts/filter_rules.sh 10 working-dir/tuning/input <working-dir/corpus/grammar >working-dir/corpus/grammar.dev2006
working-dir/mert
java -cp $JOSHUA/bin joshua.zmert.ZMERT -maxMem 1500 mert/mert.config
mkdir -p working-dir/evaluation
cp mert/joshua.config.ZMERT.final working-dir/evaluation/joshua.config
mkdir -p working-dir/evaluation
scripts/tokenizer.perl -l fr < wmt08/devtest/devtest2006.fr > working-dir/evaluation/devtest2006.input.tok
scripts/tokenizer.perl -l en < wmt08/devtest/devtest2006.en > working-dir/evaluation/devtest2006.reference.tok
scripts/lowercase.perl < working-dir/evaluation/devtest2006.input.tok > working-dir/evaluation/devtest2006.input
scripts/lowercase.perl < working-dir/evaluation/devtest2006.reference.tok > working-dir/evaluation/devtest2006.reference
$THRAX/scripts/filter_rules.sh 10 working-dir/evaluation/devtest2006.input <working-dir/corpus/grammar >working-dir/corpus/grammar.devtest2006
working-dir/evaluation/joshua.config
tm_file=working-dir/corpus/grammar.devtest2006
java -Xmx1g -cp $JOSHUA/bin -Djava.library.path=$JOSHUA/lib -Dfile.encoding=utf8 joshua.decoder.JoshuaDecoder working-dir/evaluation/joshua.config working-dir/evaluation/devtest2006.input working-dir/evaluation/devtest2006.output
java -cp $JOSHUA/bin -Dfile.encoding=utf8 joshua.util.ExtractTopCand working-dir/evaluation/devtest2006.output working-dir/evaluation/devtest2006.output.1best
$SRILM/bin/macosx64/ngram-count -unk -order 5 -kndiscount1 -kndiscount2 -kndiscount3 -kndiscount4 -kndiscount5 -text working-dir/lm/europarl.tok -lm working-dir/lm/training.TrueCase.5gram.lm
perl truecase-map.perl <working-dir/lm/europarl.tok >working-dir/lm/true-case.map
$SRILM/bin/macosx/disambig -lm working-dir/lm/training.TrueCase.5gram.lm -keep-unk -order 5 -map working-dir/lm/true-case.map -text working-dir/evaluation/devtest2006.output.1best | perl strip-sent-tags.perl > working-dir/evaluation/devtest2006.output.recased
scripts/detokenizer.perl -l en < working-dir/evaluation/devtest2006.output.recased > working-dir/evaluation/devtest2006.output.detokenized
scripts/wrap-xml.perl wmt08/devtest/devtest2006-ref.en.sgm en < working-dir/evaluation/devtest2006.output.detokenized > working-dir/evaluation/devtest2006.output.sgm
mteval-v11b.pl -r wmt08/devtest/devtest2006-ref.en.sgm -t working-dir/evaluation/devtest2006.output.sgm -s wmt08/devtest/devtest2006-src.fr.sgm -c
supported by the EuroMatrixPlus project
P7-IST-231720-STP
funded by the European Commission
under Framework Programme 7