# Page residue from the file viewer this script was copied from: status "Running", file size 20,237 bytes, revision 3f49919.
import json
import sqlite3
import requests
import time
from pathlib import Path
# Filesystem location of the SQLite database that the collected quotes are stored in.
DB_PATH = Path("data/quotes.db")
def create_database(db_path=None):
    """Create the quotes database if needed and return an open connection.

    Ensures the parent directory of the database file exists, creates the
    ``quotes`` table plus the ``idx_type`` and ``idx_dynasty`` indexes when
    they are missing, and commits.

    Args:
        db_path: Optional database location. When omitted, the module-level
            ``DB_PATH`` is used. The SQLite special name ``":memory:"`` is
            accepted, in which case no directory is created.

    Returns:
        An open ``sqlite3.Connection``. The caller is responsible for
        closing it.

    Raises:
        OSError: If the parent directory cannot be created.
        sqlite3.Error: If the schema cannot be created. The connection is
            closed before the error propagates.
    """
    target = DB_PATH if db_path is None else db_path
    if str(target) != ":memory:":
        Path(target).parent.mkdir(parents=True, exist_ok=True)

    conn = sqlite3.connect(target)
    try:
        cursor = conn.cursor()
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS quotes (
                id TEXT PRIMARY KEY,
                text TEXT NOT NULL,
                author TEXT,
                source TEXT,
                dynasty TEXT,
                type TEXT,
                tags TEXT,
                emotion TEXT
            )
        """)
        cursor.execute("CREATE INDEX IF NOT EXISTS idx_type ON quotes(type)")
        cursor.execute("CREATE INDEX IF NOT EXISTS idx_dynasty ON quotes(dynasty)")
        conn.commit()
    except Exception:
        # Do not leak the connection when schema setup fails; the caller
        # never receives a handle it could close itself.
        conn.close()
        raise
    return conn
def insert_quotes(conn, quotes):
    """Insert quote records into the ``quotes`` table, skipping duplicates.

    Each record is written with ``INSERT OR IGNORE``, so a row whose ``id``
    already exists is left untouched and not counted. The ``tags`` and
    ``emotion`` values are stored as JSON text (non-ASCII kept as-is).
    A record that cannot be inserted is reported on stdout and skipped; the
    remaining records are still processed and committed.

    Args:
        conn: Open ``sqlite3`` connection to a database containing the
            ``quotes`` table.
        quotes: Iterable of dicts. ``id`` and ``text`` are required keys;
            ``author``, ``source``, ``dynasty``, ``type``, ``tags`` and
            ``emotion`` are optional.

    Returns:
        The number of rows actually inserted.
    """
    insert_sql = """
        INSERT OR IGNORE INTO quotes (id, text, author, source, dynasty, type, tags, emotion)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
    """
    cursor = conn.cursor()
    inserted = 0
    for quote in quotes:
        try:
            cursor.execute(insert_sql, (
                quote['id'],
                quote['text'],
                quote.get('author'),
                quote.get('source'),
                quote.get('dynasty'),
                quote.get('type'),
                json.dumps(quote.get('tags', []), ensure_ascii=False),
                json.dumps(quote.get('emotion', []), ensure_ascii=False),
            ))
        except Exception as e:
            # Per-record best effort: one bad record must not abort the batch.
            # The id is looked up defensively because a missing 'id' key is
            # itself one of the failures reported here; indexing it again
            # would raise a second KeyError from inside the handler.
            if isinstance(quote, dict):
                quote_id = quote.get('id', '<missing id>')
            else:
                quote_id = '<invalid record>'
            print(f"Error inserting {quote_id}: {e}")
        else:
            # rowcount is 0 when OR IGNORE skipped an existing id.
            if cursor.rowcount > 0:
                inserted += 1
    conn.commit()
    return inserted
def fetch_chinese_poetry():
    """Download Tang dynasty poems and convert them into quote records.

    Requests one JSON file per batch offset (0, 1000, ..., 57000) and pauses
    0.5 seconds after each successful batch. From every record the
    ``paragraphs`` are joined into one text; poems whose text is shorter than
    10 characters are dropped. A batch that fails (network error, non-200
    status, invalid JSON, unexpected record shape) is reported on stdout and
    skipped, so the function always returns whatever could be collected.

    Returns:
        A list of dicts with the keys ``id``, ``text``, ``author``,
        ``source``, ``dynasty``, ``type``, ``tags`` and ``emotion``.
    """
    poems = []
    base_url = "https://raw.githubusercontent.com/chinese-poetry/chinese-poetry/master"
    print("Fetching Tang Dynasty poems...")
    for i in range(0, 58000, 1000):
        # NOTE(review): confirm this path still matches the upstream
        # repository layout; a mismatch now shows up as "HTTP 404" below.
        url = f"{base_url}/poet/poet_{i}.json"
        try:
            response = requests.get(url, timeout=15)
            if response.status_code != 200:
                # Previously ignored without a message, which made a run that
                # fetched nothing look the same as a successful one.
                print(f" Skipping batch {i}: HTTP {response.status_code}")
                continue
            data = response.json()
            for index, poem in enumerate(data):
                text = "".join(poem.get("paragraphs", []))
                if len(text) < 10:
                    continue
                # Fall back to batch offset plus position so poems without an
                # upstream id do not all collapse onto one primary key.
                poem_id = poem.get("id") or f"{i}_{index}"
                poems.append({
                    "id": f"tang_{poem_id}",
                    "text": text,
                    "author": poem.get("author", ""),
                    "source": poem.get("title", ""),
                    "dynasty": "唐",
                    "type": "诗词",
                    "tags": [],
                    "emotion": [],
                })
            print(f" Fetched {len(data)} poems from batch {i}")
            time.sleep(0.5)
        except Exception as e:
            # Batch-level best effort: report the failure and keep going.
            print(f" Error fetching batch {i}: {e}")
    return poems
def fetch_song_poetry():
    """Download Song dynasty poems and convert them into quote records.

    Requests one JSON file per batch offset (0, 1000, ..., 24000) and pauses
    0.5 seconds after each successful batch. From every record the
    ``paragraphs`` are joined into one text; poems whose text is shorter than
    10 characters are dropped. A batch that fails (network error, non-200
    status, invalid JSON, unexpected record shape) is reported on stdout and
    skipped, so the function always returns whatever could be collected.

    Returns:
        A list of dicts with the keys ``id``, ``text``, ``author``,
        ``source``, ``dynasty``, ``type``, ``tags`` and ``emotion``.
    """
    poems = []
    base_url = "https://raw.githubusercontent.com/chinese-poetry/chinese-poetry/master"
    print("Fetching Song Dynasty poems...")
    for i in range(0, 25000, 1000):
        # NOTE(review): confirm this path still matches the upstream
        # repository layout; a mismatch now shows up as "HTTP 404" below.
        url = f"{base_url}/poet/poet.song_{i}.json"
        try:
            response = requests.get(url, timeout=15)
            if response.status_code != 200:
                # Previously ignored without a message, which made a run that
                # fetched nothing look the same as a successful one.
                print(f" Skipping batch {i}: HTTP {response.status_code}")
                continue
            data = response.json()
            for index, poem in enumerate(data):
                text = "".join(poem.get("paragraphs", []))
                if len(text) < 10:
                    continue
                # Fall back to batch offset plus position so poems without an
                # upstream id do not all collapse onto one primary key.
                poem_id = poem.get("id") or f"{i}_{index}"
                poems.append({
                    "id": f"song_{poem_id}",
                    "text": text,
                    "author": poem.get("author", ""),
                    "source": poem.get("title", ""),
                    "dynasty": "宋",
                    "type": "诗词",
                    "tags": [],
                    "emotion": [],
                })
            print(f" Fetched {len(data)} poems from batch {i}")
            time.sleep(0.5)
        except Exception as e:
            # Batch-level best effort: report the failure and keep going.
            print(f" Error fetching batch {i}: {e}")
    return poems
def fetch_idioms():
    """Download the idiom list and convert it into quote records.

    Fetches a single JSON file and keeps every entry whose ``word`` is at
    least 3 characters long. The entry's ``derivation`` becomes the record's
    ``source``. Any failure (network error, non-200 status, invalid JSON,
    unexpected record shape) is reported on stdout; the idioms collected up
    to that point are returned.

    Returns:
        A list of dicts with the keys ``id``, ``text``, ``author``,
        ``source``, ``dynasty``, ``type``, ``tags`` and ``emotion``.
    """
    idioms = []
    url = "https://raw.githubusercontent.com/pwxcoo/chinese-xinhua/master/json/idioms.json"
    print("Fetching idioms...")
    try:
        response = requests.get(url, timeout=30)
        if response.status_code != 200:
            # Previously a non-200 answer returned an empty list with no
            # message at all.
            print(f" Skipping idioms: HTTP {response.status_code}")
            return idioms
        for item in response.json():
            word = item.get('word', '')
            # The truthiness check guards against a null "word" before len().
            if word and len(word) >= 3:
                idioms.append({
                    "id": f"idiom_{word}",
                    "text": word,
                    "author": None,
                    "source": item.get('derivation', ''),
                    "dynasty": None,
                    "type": "成语",
                    "tags": [],
                    "emotion": [],
                })
        print(f" Fetched {len(idioms)} idioms")
    except Exception as e:
        # Best effort: report the failure and return what was collected.
        print(f" Error fetching idioms: {e}")
    return idioms
def fetch_classic_quotes():
    """Return the built-in collection of classic quotes as quote records.

    The quotes are hard-coded; nothing is downloaded. Each record gets the
    id ``classic_<position>`` (its zero-based position in the table), the
    type ``名句``, and fresh empty ``tags`` and ``emotion`` lists.

    Returns:
        A list of dicts with the keys ``id``, ``text``, ``author``,
        ``source``, ``dynasty``, ``type``, ``tags`` and ``emotion``.
    """
    # Columns: text, author, source, dynasty. Row order determines the ids.
    rows = (
        ("学而时习之,不亦说乎", "孔子", "论语", "春秋"),
        ("温故而知新,可以为师矣", "孔子", "论语", "春秋"),
        ("学而不思则罔,思而不学则殆", "孔子", "论语", "春秋"),
        ("知之为知之,不知为不知,是知也", "孔子", "论语", "春秋"),
        ("己所不欲,勿施于人", "孔子", "论语", "春秋"),
        ("三人行,必有我师焉", "孔子", "论语", "春秋"),
        ("君子坦荡荡,小人长戚戚", "孔子", "论语", "春秋"),
        ("生于忧患,死于安乐", "孟子", "孟子", "战国"),
        ("得道多助,失道寡助", "孟子", "孟子", "战国"),
        ("富贵不能淫,贫贱不能移,威武不能屈", "孟子", "孟子", "战国"),
        ("天行健,君子以自强不息", "佚名", "周易", "先秦"),
        ("地势坤,君子以厚德载物", "佚名", "周易", "先秦"),
        ("路漫漫其修远兮,吾将上下而求索", "屈原", "离骚", "战国"),
        ("长太息以掩涕兮,哀民生之多艰", "屈原", "离骚", "战国"),
        ("亦余心之所善兮,虽九死其犹未悔", "屈原", "离骚", "战国"),
        ("老骥伏枥,志在千里", "曹操", "龟虽寿", "东汉"),
        ("烈士暮年,壮心不已", "曹操", "龟虽寿", "东汉"),
        ("山不厌高,海不厌深", "曹操", "短歌行", "东汉"),
        ("周公吐哺,天下归心", "曹操", "短歌行", "东汉"),
        ("捐躯赴国难,视死忽如归", "曹植", "白马篇", "三国"),
        ("本是同根生,相煎何太急", "曹植", "七步诗", "三国"),
        ("鞠躬尽瘁,死而后已", "诸葛亮", "后出师表", "三国"),
        ("非淡泊无以明志,非宁静无以致远", "诸葛亮", "诫子书", "三国"),
        ("采菊东篱下,悠然见南山", "陶渊明", "饮酒", "东晋"),
        ("羁鸟恋旧林,池鱼思故渊", "陶渊明", "归园田居", "东晋"),
        ("海内存知己,天涯若比邻", "王勃", "送杜少府之任蜀州", "唐"),
        ("落霞与孤鹜齐飞,秋水共长天一色", "王勃", "滕王阁序", "唐"),
        ("前不见古人,后不见来者", "陈子昂", "登幽州台歌", "唐"),
        ("念天地之悠悠,独怆然而涕下", "陈子昂", "登幽州台歌", "唐"),
        ("春江潮水连海平,海上明月共潮生", "张若虚", "春江花月夜", "唐"),
        ("人生代代无穷已,江月年年望相似", "张若虚", "春江花月夜", "唐"),
        ("欲穷千里目,更上一层楼", "王之涣", "登鹳雀楼", "唐"),
        ("黄河远上白云间,一片孤城万仞山", "王之涣", "凉州词", "唐"),
        ("独在异乡为异客,每逢佳节倍思亲", "王维", "九月九日忆山东兄弟", "唐"),
        ("劝君更尽一杯酒,西出阳关无故人", "王维", "送元二使安西", "唐"),
        ("大漠孤烟直,长河落日圆", "王维", "使至塞上", "唐"),
        ("明月松间照,清泉石上流", "王维", "山居秋暝", "唐"),
        ("天生我材必有用,千金散尽还复来", "李白", "将进酒", "唐"),
        ("长风破浪会有时,直挂云帆济沧海", "李白", "行路难", "唐"),
        ("举杯邀明月,对影成三人", "李白", "月下独酌", "唐"),
        ("抽刀断水水更流,举杯消愁愁更愁", "李白", "宣州谢朓楼饯别校书叔云", "唐"),
        ("安能摧眉折腰事权贵,使我不得开心颜", "李白", "梦游天姥吟留别", "唐"),
        ("会当凌绝顶,一览众山小", "杜甫", "望岳", "唐"),
        ("读书破万卷,下笔如有神", "杜甫", "奉赠韦左丞丈二十二韵", "唐"),
        ("出师未捷身先死,长使英雄泪满襟", "杜甫", "蜀相", "唐"),
        ("无边落木萧萧下,不尽长江滚滚来", "杜甫", "登高", "唐"),
        ("安得广厦千万间,大庇天下寒士俱欢颜", "杜甫", "茅屋为秋风所破歌", "唐"),
        ("忽如一夜春风来,千树万树梨花开", "岑参", "白雪歌送武判官归京", "唐"),
        ("沉舟侧畔千帆过,病树前头万木春", "刘禹锡", "酬乐天扬州初逢席上见赠", "唐"),
        ("旧时王谢堂前燕,飞入寻常百姓家", "刘禹锡", "乌衣巷", "唐"),
        ("千淘万漉虽辛苦,吹尽狂沙始到金", "刘禹锡", "浪淘沙", "唐"),
        ("野火烧不尽,春风吹又生", "白居易", "赋得古原草送别", "唐"),
        ("同是天涯沦落人,相逢何必曾相识", "白居易", "琵琶行", "唐"),
        ("在天愿作比翼鸟,在地愿为连理枝", "白居易", "长恨歌", "唐"),
        ("千呼万唤始出来,犹抱琵琶半遮面", "白居易", "琵琶行", "唐"),
        ("曾经沧海难为水,除却巫山不是云", "元稹", "离思", "唐"),
        ("黑云压城城欲摧,甲光向日金鳞开", "李贺", "雁门太守行", "唐"),
        ("男儿何不带吴钩,收取关山五十州", "李贺", "南园", "唐"),
        ("商女不知亡国恨,隔江犹唱后庭花", "杜牧", "泊秦淮", "唐"),
        ("停车坐爱枫林晚,霜叶红于二月花", "杜牧", "山行", "唐"),
        ("东风不与周郎便,铜雀春深锁二乔", "杜牧", "赤壁", "唐"),
        ("春蚕到死丝方尽,蜡炬成灰泪始干", "李商隐", "无题", "唐"),
        ("身无彩凤双飞翼,心有灵犀一点通", "李商隐", "无题", "唐"),
        ("夕阳无限好,只是近黄昏", "李商隐", "登乐游原", "唐"),
        ("先天下之忧而忧,后天下之乐而乐", "范仲淹", "岳阳楼记", "宋"),
        ("不以物喜,不以己悲", "范仲淹", "岳阳楼记", "宋"),
        ("醉翁之意不在酒,在乎山水之间也", "欧阳修", "醉翁亭记", "宋"),
        ("人生自是有情痴,此恨不关风与月", "欧阳修", "玉楼春", "宋"),
        ("衣带渐宽终不悔,为伊消得人憔悴", "柳永", "蝶恋花", "宋"),
        ("今宵酒醒何处?杨柳岸,晓风残月", "柳永", "雨霖铃", "宋"),
        ("但愿人长久,千里共婵娟", "苏轼", "水调歌头", "宋"),
        ("大江东去,浪淘尽,千古风流人物", "苏轼", "念奴娇·赤壁怀古", "宋"),
        ("竹杖芒鞋轻胜马,谁怕?一蓑烟雨任平生", "苏轼", "定风波", "宋"),
        ("回首向来萧瑟处,归去,也无风雨也无晴", "苏轼", "定风波", "宋"),
        ("十年生死两茫茫,不思量,自难忘", "苏轼", "江城子", "宋"),
        ("人生到处知何似,应似飞鸿踏雪泥", "苏轼", "和子由渑池怀旧", "宋"),
        ("不识庐山真面目,只缘身在此山中", "苏轼", "题西林壁", "宋"),
        ("两情若是久长时,又岂在朝朝暮暮", "秦观", "鹊桥仙", "宋"),
        ("此情无计可消除,才下眉头,却上心头", "李清照", "一剪梅", "宋"),
        ("寻寻觅觅,冷冷清清,凄凄惨惨戚戚", "李清照", "声声慢", "宋"),
        ("生当作人杰,死亦为鬼雄", "李清照", "夏日绝句", "宋"),
        ("三十功名尘与土,八千里路云和月", "岳飞", "满江红", "宋"),
        ("莫等闲,白了少年头,空悲切", "岳飞", "满江红", "宋"),
        ("壮志饥餐胡虏肉,笑谈渴饮匈奴血", "岳飞", "满江红", "宋"),
        ("山重水复疑无路,柳暗花明又一村", "陆游", "游山西村", "宋"),
        ("王师北定中原日,家祭无忘告乃翁", "陆游", "示儿", "宋"),
        ("小楼一夜听春雨,深巷明朝卖杏花", "陆游", "临安春雨初霁", "宋"),
        ("出师一表真名世,千载谁堪伯仲间", "陆游", "书愤", "宋"),
        ("人生自古谁无死,留取丹心照汗青", "文天祥", "过零丁洋", "宋"),
        ("臣心一片磁针石,不指南方不肯休", "文天祥", "扬子江", "宋"),
        ("问渠那得清如许,为有源头活水来", "朱熹", "观书有感", "宋"),
        ("等闲识得东风面,万紫千红总是春", "朱熹", "春日", "宋"),
        ("众里寻他千百度,蓦然回首,那人却在,灯火阑珊处", "辛弃疾", "青玉案·元夕", "宋"),
        ("想当年,金戈铁马,气吞万里如虎", "辛弃疾", "永遇乐·京口北固亭怀古", "宋"),
        ("醉里挑灯看剑,梦回吹角连营", "辛弃疾", "破阵子", "宋"),
        ("稻花香里说丰年,听取蛙声一片", "辛弃疾", "西江月·夜行黄沙道中", "宋"),
        ("春色满园关不住,一枝红杏出墙来", "叶绍翁", "游园不值", "宋"),
        ("落红不是无情物,化作春泥更护花", "龚自珍", "己亥杂诗", "清"),
        ("我劝天公重抖擞,不拘一格降人才", "龚自珍", "己亥杂诗", "清"),
        ("苟利国家生死以,岂因祸福避趋之", "林则徐", "赴戍登程口占示家人", "清"),
        ("海到无边天作岸,山登绝顶我为峰", "林则徐", "出老", "清"),
        ("我自横刀向天笑,去留肝胆两昆仑", "谭嗣同", "狱中题壁", "清"),
        ("横眉冷对千夫指,俯首甘为孺子牛", "鲁迅", "自嘲", "近代"),
        ("寄意寒星荃不察,我以我血荐轩辕", "鲁迅", "自题小像", "近代"),
        ("心事浩茫连广宇,于无声处听惊雷", "鲁迅", "无题", "近代"),
    )
    return [
        {
            "id": f"classic_{position}",
            "text": text,
            "author": author,
            "source": source,
            "dynasty": dynasty,
            "type": "名句",
            "tags": [],
            "emotion": [],
        }
        for position, (text, author, source, dynasty) in enumerate(rows)
    ]
def main():
    """Run the full collection pipeline and print a progress report.

    Creates or opens the database, then for each source in turn (Tang poems,
    Song poems, idioms, built-in classic quotes) fetches the records, inserts
    them, and prints how many rows were new. The database connection is
    closed even when a step raises. Finishes with the total number of
    inserted rows and the database path.
    """
    banner = "=" * 60
    print(banner)
    print("名句数据收集")
    print(banner)

    # (progress headline, fetcher, noun used in the "Inserted ..." line)
    steps = (
        ("Fetching Tang Dynasty poems...", fetch_chinese_poetry, "Tang poems"),
        ("Fetching Song Dynasty poems...", fetch_song_poetry, "Song poems"),
        ("Fetching idioms...", fetch_idioms, "idioms"),
        ("Adding classic quotes...", fetch_classic_quotes, "classic quotes"),
    )

    conn = create_database()
    total_inserted = 0
    try:
        for number, (headline, fetch, noun) in enumerate(steps, start=1):
            print(f"\n[{number}/{len(steps)}] {headline}")
            inserted = insert_quotes(conn, fetch())
            total_inserted += inserted
            print(f" Inserted {inserted} {noun}")
    finally:
        # Release the database handle even if a fetch or insert step fails.
        conn.close()

    print("\n" + banner)
    print(f"Total inserted: {total_inserted} quotes")
    print(f"Database saved to: {DB_PATH}")
    print(banner)
if __name__ == "__main__":
    # Run the collection pipeline only when this file is executed as a script.
    main()
|