Difference between revisions of "DataBase"

From cslt Wiki
Jump to: navigation, search
Line 1: Line 1:
==table 1==
+
==lm==
 
{| class="wikitable"
 
{| class="wikitable"
! name !! type!! size !! dir !! description
+
! name !! size !! dir !! description
 
|-
 
|-
|863   ||speech||51h, 76spk || corpora/863 || 863 reading speech database. 16k,16bit
+
|SogouQ.full.3gram.gz   ||132M ||/work/lxs/nlphome/lm/SogouQ-500M ||
 
|-
 
|-
|emotion||speech||22h    ||corpora/emotion||emotional speech for SID, recorded in CSLT. 16k,16bit
+
|SogouQ.full.train.3gram.gz || 132M || /work/lxs/nlphome/lm/SogouQ-500M ||
 
|-
 
|-
|callhome ||text||9.02Mb|| corpora/callhome || callhome chinese speech database transcription
+
|SogouT-11w-merge2-1.3gram.gz || 4.1G ||/work/lxs/nlphome/lm/SogouT-140G ||
 
|-
 
|-
|tcmsd  ||speech ||34h,60spk ||corpora/tcmsd|| speech database recorded in Tsinghua, 2002. 16k, 16bit
+
|SogouT-11w-merge2-2.3gram.gz || 3.9G || /work/lxs/nlphome/lm/SogouT-140G ||
 
|-
 
|-
|timit  ||speech ||5.4h  ||corpora/timit || English timit database
+
|8w8.3gram.tencent.gz || 452M || /work/lxs/nlphome/lm/Tencent ||
 
|-
 
|-
|gigaword||text || 668MW ||corpora/chinese_gigaword || Gigaword text for Chinese
+
|musicQuery-ltc.3gram.gz || 28M || /work/lxs/nlphome/lm/TencentQ/musicQuery ||use qa15w-singer-songs.wordlist
 
|-
 
|-
|ulgur ||speech&text||xju: 141h (tr. 136h) xjnu: 8.54h ||corpora/ulgur ||Uyghur speech and text data
+
|TencentQ.3gram.gz || 1.4G || /work/lxs/nlphome/lm/TencentQ/qa15w ||use qa15w.lexicion
 
|-
 
|-
|tvboard||speech ||-|| corpora/tvboard ||TV and broadcast untranscribed archive
+
|mix-corp1-corp2.3gram.gz || 1.3G ||/work/lxs/nlphome/lm/TencentQ/qa15w-nosinger-song||use qa15w-nosinger-song.wordlist
 
|-
 
|-
|weibo || text||10Gb|| corpora/weibo || English weibo text data
+
|mix-corp1_0.5-corp2_0.5.3gram.gz||1.4G||/work/lxs/nlphome/lm/TencentQ/qa15w-singer-song||use qa15w-singer-song.wordlist
 
|-
 
|-
|qa || text||124Gb|| corpora/qa || QA text data
+
|11w_merge6_kn.3gram.gz||4.3G||/work/lxs/nlphome/lm/TencentQA-100G||
 
|-
 
|-
|pvad||speech||5.4h||corpora/puqiang/VAD || speech data for VAD, from Pachira
+
|8w8_new_merge6_kn.3gram0.gz||4.5G||/work/lxs/nlphome/lm/TencentQA-100G||
 
|-
 
|-
|ppoi||speech||208h||corpora/puqiang/poi || 8k telephone speech in poi from Pachira
+
|Hunhe_zhongzi_and_add_and_PPL_5yuan_3e9.lm.utf8.1e-5.3gram.gz||1.4M||/work/lxs/nlphome/lm/jietong||
 
|-
 
|-
|T400||speech||400h||corpora/tencent ||speech data from Tencent
+
|Hunhe_zhongzi_and_add_and_PPL_5yuan_3e9.lm.utf8.1e-9.5gram.gz||389M||/work/lxs/nlphome/lm/jietong||
 +
|}
 +
 
 +
==lexicon wordlist==
 +
{| class="wikitable"
 +
! name !! size !! dir !! description
 
|-
 
|-
|dt700 ||speech||700h||corpora/tencent/dt700 ||700 hour reading speech data
+
|singer.lexicion||74k ||/work/lxs/nlphome/dict/lex-wordlist/music/lr ||
 
|-
 
|-
|legend-vod || speech ||-||corpora/legend-vod ||some test speech and vod
+
|singer.low.lexicion||74k||/work/lxs/nlphome/dict/lex-wordlist/music/lr||
 
|-
 
|-
|mobil-eng || speech ||26h||corpora/lenvxx/data/wav/mobil-eng ||English speech by Chinese speakers
+
|singer.pinyin||44k||/work/lxs/nlphome/dict/lex-wordlist/music/lr||
 
|-
 
|-
|legend-online || speech ||54h||corpora/lenvxx/data/wav/real-online || online speech data
+
|song.lexicion||255k||/work/lxs/nlphome/dict/lex-wordlist/music/lr||
 
|-
 
|-
|legend-wakeup || speech ||1h||corpora/lenvxx/data/wav/wake-up || wake up test speech
+
|song.low.lexicion||255k||/work/lxs/nlphome/dict/lex-wordlist/music/lr||
 
|-
 
|-
|legend-reading || speech ||21h||corpora/lenvxx/data/wav/haitian || reading speech
+
|song.pinyin||167k||/work/lxs/nlphome/dict/lex-wordlist/music/lr||
 
|-
 
|-
|legend-sel-for-test || speech ||21h||corpora/lenvxx/data/wav/sel_for_test || reading speech
+
|qa15w-ch-sinovoice.lexicion||2.9M||/work/lxs/nlphome/dict/lex-wordlist/qa-check||
 
|-
 
|-
|POI-lexicon || lexicon ||-||corpora/lenvxx/data/lexicon || lexicon for POI applications
+
|qa15w-ch.pinyin||1.7M||/work/lxs/nlphome/dict/lex-wordlist/qa-check||
 
|-
 
|-
|NLPR || lexicon,categories ||-||corpora/lenvxx/data/text/nlpcorpus || resources of NLP tasks
+
|qa15w.lexicion||4.9M||/work/lxs/nlphome/dict/lex-wordlist/qa-check||
 
|-
 
|-
|serviceT || text ||-||corpora/lenvxx/data/text/service_text || text recorded from online service
+
|11w.lexicion||3.8M||/work/lxs/nlphome/dict/lex-wordlist/tencent||
 
|-
 
|-
|sougouText || text ||-||corpora/sogou || sogouQ and sogouT
+
|8w8.lexicion||2.5M||/work/lxs/nlphome/dict/lex-wordlist/tencent||
 +
|}
 +
 
 +
==nolexicon wordlist==
 +
{| class="wikitable"
 +
! name !! size !! dir !! description
 
|-
 
|-
|wsj  || speech ||100h||corpora/wsj ||Wall Street Journal speech database
+
|singer.wordlist||19k||/work/lxs/nlphome/dict/nolex-wordlist/music/lr||
 
|-
 
|-
|hownet || lexicon || - || corpora/hownet || HowNet relation db
+
|song.wordlist||68k||/work/lxs/nlphome/dict/nolex-wordlist/music/lr||
 
|-
 
|-
|casia || speech ||4000 u || corpora/tts/casia || male TTS speech
+
|album.txt||227k||/work/lxs/nlphome/dict/nolex-wordlist/music/ltc||
 
|-
 
|-
|huilan-tts || speech ||2000 u || corpora/tts/huilan || male/female TTS speech from Huilan
+
|area.txt||32B||/work/lxs/nlphome/dict/nolex-wordlist/music/ltc||
 
|-
 
|-
|tts-novel || speech ||20h  || corpora/tts/novel || speech data downloaded from the internet for TTS
+
|chart.txt||336B||/work/lxs/nlphome/dict/nolex-wordlist/music/ltc||
 
|-
 
|-
|Sinovoice-tel || speech || 470h+300h || corpora/sinovoice/tel || telephone speech data from Sinovoice
+
|drama.txt||7.2k||/work/lxs/nlphome/dict/nolex-wordlist/music/ltc||
 
|-
 
|-
|Sinovoice-16k || speech || 6000h || corpora/sinovoice/16k || mobile 16k speech data from Sinovoice
+
|language.txt||343B||/work/lxs/nlphome/dict/nolex-wordlist/music/ltc||
 
|-
 
|-
|}
+
|singer.txt||42k||/work/lxs/nlphome/dict/nolex-wordlist/music/ltc||
 
+
|-
==table 2==
+
|stopwords.txt||6.1k||/work/lxs/nlphome/dict/nolex-wordlist/music/ltc||
{| class="wikitable"
+
|-
! name !! type!! size !! dir !! description
+
|song.txt||408k||/work/lxs/nlphome/dict/nolex-wordlist/music/ltc||
 +
|-
 +
|style.txt||6.6k||/work/lxs/nlphome/dict/nolex-wordlist/music/ltc||
 +
|-
 +
|type.txt||18B||/work/lxs/nlphome/dict/nolex-wordlist/music/ltc||
 +
|-
 +
|entity.txt||590k||/work/lxs/nlphome/dict/nolex-wordlist/music/ltc||merge album area chart drama language singer song stopwords style type
 +
|-
 +
|qa15w.wordlist||1.2M||/work/lxs/nlphome/dict/nolex-wordlist/qa-check||
 +
|-
 +
|11w.wordlist||888k||/work/lxs/nlphome/dict/nolex-wordlist/tencent||
 
|-
 
|-
|863    ||speech||51h, 76spk || corpora/863 || 863 reading speech database. 16k,16bit
+
|8w8.wordlist||666k||/work/lxs/nlphome/dict/nolex-wordlist/tencent||
 
|-
 
|-
 +
|scws20w-utf8.wordlist||6.5M||/work/lxs/nlphome/dict/nolex-wordlist||
 
|}
 
|}

Revision as of 06:25, 19 February 2014

lm

name size dir description
SogouQ.full.3gram.gz 132M /work/lxs/nlphome/lm/SogouQ-500M
SogouQ.full.train.3gram.gz 132M /work/lxs/nlphome/lm/SogouQ-500M
SogouT-11w-merge2-1.3gram.gz 4.1G /work/lxs/nlphome/lm/SogouT-140G
SogouT-11w-merge2-2.3gram.gz 3.9G /work/lxs/nlphome/lm/SogouT-140G
8w8.3gram.tencent.gz 452M /work/lxs/nlphome/lm/Tencent
musicQuery-ltc.3gram.gz 28M /work/lxs/nlphome/lm/TencentQ/musicQuery use qa15w-singer-songs.wordlist
TencentQ.3gram.gz 1.4G /work/lxs/nlphome/lm/TencentQ/qa15w use qa15w.lexicion
mix-corp1-corp2.3gram.gz 1.3G /work/lxs/nlphome/lm/TencentQ/qa15w-nosinger-song use qa15w-nosinger-song.wordlist
mix-corp1_0.5-corp2_0.5.3gram.gz 1.4G /work/lxs/nlphome/lm/TencentQ/qa15w-singer-song use qa15w-singer-song.wordlist
11w_merge6_kn.3gram.gz 4.3G /work/lxs/nlphome/lm/TencentQA-100G
8w8_new_merge6_kn.3gram0.gz 4.5G /work/lxs/nlphome/lm/TencentQA-100G
Hunhe_zhongzi_and_add_and_PPL_5yuan_3e9.lm.utf8.1e-5.3gram.gz 1.4M /work/lxs/nlphome/lm/jietong
Hunhe_zhongzi_and_add_and_PPL_5yuan_3e9.lm.utf8.1e-9.5gram.gz 389M /work/lxs/nlphome/lm/jietong

lexicon wordlist

name size dir description
singer.lexicion 74k /work/lxs/nlphome/dict/lex-wordlist/music/lr
singer.low.lexicion 74k /work/lxs/nlphome/dict/lex-wordlist/music/lr
singer.pinyin 44k /work/lxs/nlphome/dict/lex-wordlist/music/lr
song.lexicion 255k /work/lxs/nlphome/dict/lex-wordlist/music/lr
song.low.lexicion 255k /work/lxs/nlphome/dict/lex-wordlist/music/lr
song.pinyin 167k /work/lxs/nlphome/dict/lex-wordlist/music/lr
qa15w-ch-sinovoice.lexicion 2.9M /work/lxs/nlphome/dict/lex-wordlist/qa-check
qa15w-ch.pinyin 1.7M /work/lxs/nlphome/dict/lex-wordlist/qa-check
qa15w.lexicion 4.9M /work/lxs/nlphome/dict/lex-wordlist/qa-check
11w.lexicion 3.8M /work/lxs/nlphome/dict/lex-wordlist/tencent
8w8.lexicion 2.5M /work/lxs/nlphome/dict/lex-wordlist/tencent

nolexicon wordlist

name size dir description
singer.wordlist 19k /work/lxs/nlphome/dict/nolex-wordlist/music/lr
song.wordlist 68k /work/lxs/nlphome/dict/nolex-wordlist/music/lr
album.txt 227k /work/lxs/nlphome/dict/nolex-wordlist/music/ltc
area.txt 32B /work/lxs/nlphome/dict/nolex-wordlist/music/ltc
chart.txt 336B /work/lxs/nlphome/dict/nolex-wordlist/music/ltc
drama.txt 7.2k /work/lxs/nlphome/dict/nolex-wordlist/music/ltc
language.txt 343B /work/lxs/nlphome/dict/nolex-wordlist/music/ltc
singer.txt 42k /work/lxs/nlphome/dict/nolex-wordlist/music/ltc
stopwords.txt 6.1k /work/lxs/nlphome/dict/nolex-wordlist/music/ltc
song.txt 408k /work/lxs/nlphome/dict/nolex-wordlist/music/ltc
style.txt 6.6k /work/lxs/nlphome/dict/nolex-wordlist/music/ltc
type.txt 18B /work/lxs/nlphome/dict/nolex-wordlist/music/ltc
entity.txt 590k /work/lxs/nlphome/dict/nolex-wordlist/music/ltc merge album area chart drama language singer song stopwords style type
qa15w.wordlist 1.2M /work/lxs/nlphome/dict/nolex-wordlist/qa-check
11w.wordlist 888k /work/lxs/nlphome/dict/nolex-wordlist/tencent
8w8.wordlist 666k /work/lxs/nlphome/dict/nolex-wordlist/tencent
scws20w-utf8.wordlist 6.5M /work/lxs/nlphome/dict/nolex-wordlist