"""Collect classical Chinese quotes, poems and idioms into a SQLite database.

The script downloads Tang and Song poetry and a list of idioms over HTTP,
adds a built-in list of famous lines, and stores everything in the
``quotes`` table of the database at ``DB_PATH``.
"""

import json
import sqlite3
import time
from pathlib import Path

import requests

DB_PATH = Path("data/quotes.db")

_POETRY_BASE_URL = (
    "https://raw.githubusercontent.com/chinese-poetry/chinese-poetry/master"
)

_INSERT_SQL = """
    INSERT OR IGNORE INTO quotes
        (id, text, author, source, dynasty, type, tags, emotion)
    VALUES (?, ?, ?, ?, ?, ?, ?, ?)
"""

# Failures that mean "this download or payload is unusable": network errors,
# undecodable JSON (a ValueError subclass), or JSON of an unexpected shape.
_FETCH_ERRORS = (requests.RequestException, ValueError, TypeError, AttributeError)

# Built-in famous lines as (text, author, source, dynasty).
# Order matters: the position in this tuple becomes the "classic_<n>" id.
_CLASSIC_TEXTS = (
    ("学而时习之,不亦说乎", "孔子", "论语", "春秋"),
    ("温故而知新,可以为师矣", "孔子", "论语", "春秋"),
    ("学而不思则罔,思而不学则殆", "孔子", "论语", "春秋"),
    ("知之为知之,不知为不知,是知也", "孔子", "论语", "春秋"),
    ("己所不欲,勿施于人", "孔子", "论语", "春秋"),
    ("三人行,必有我师焉", "孔子", "论语", "春秋"),
    ("君子坦荡荡,小人长戚戚", "孔子", "论语", "春秋"),
    ("生于忧患,死于安乐", "孟子", "孟子", "战国"),
    ("得道多助,失道寡助", "孟子", "孟子", "战国"),
    ("富贵不能淫,贫贱不能移,威武不能屈", "孟子", "孟子", "战国"),
    ("天行健,君子以自强不息", "佚名", "周易", "先秦"),
    ("地势坤,君子以厚德载物", "佚名", "周易", "先秦"),
    ("路漫漫其修远兮,吾将上下而求索", "屈原", "离骚", "战国"),
    ("长太息以掩涕兮,哀民生之多艰", "屈原", "离骚", "战国"),
    ("亦余心之所善兮,虽九死其犹未悔", "屈原", "离骚", "战国"),
    ("老骥伏枥,志在千里", "曹操", "龟虽寿", "东汉"),
    ("烈士暮年,壮心不已", "曹操", "龟虽寿", "东汉"),
    ("山不厌高,海不厌深", "曹操", "短歌行", "东汉"),
    ("周公吐哺,天下归心", "曹操", "短歌行", "东汉"),
    ("捐躯赴国难,视死忽如归", "曹植", "白马篇", "三国"),
    ("本是同根生,相煎何太急", "曹植", "七步诗", "三国"),
    ("鞠躬尽瘁,死而后已", "诸葛亮", "后出师表", "三国"),
    ("非淡泊无以明志,非宁静无以致远", "诸葛亮", "诫子书", "三国"),
    ("采菊东篱下,悠然见南山", "陶渊明", "饮酒", "东晋"),
    ("羁鸟恋旧林,池鱼思故渊", "陶渊明", "归园田居", "东晋"),
    ("海内存知己,天涯若比邻", "王勃", "送杜少府之任蜀州", "唐"),
    ("落霞与孤鹜齐飞,秋水共长天一色", "王勃", "滕王阁序", "唐"),
    ("前不见古人,后不见来者", "陈子昂", "登幽州台歌", "唐"),
    ("念天地之悠悠,独怆然而涕下", "陈子昂", "登幽州台歌", "唐"),
    ("春江潮水连海平,海上明月共潮生", "张若虚", "春江花月夜", "唐"),
    ("人生代代无穷已,江月年年望相似", "张若虚", "春江花月夜", "唐"),
    ("欲穷千里目,更上一层楼", "王之涣", "登鹳雀楼", "唐"),
    ("黄河远上白云间,一片孤城万仞山", "王之涣", "凉州词", "唐"),
    ("独在异乡为异客,每逢佳节倍思亲", "王维", "九月九日忆山东兄弟", "唐"),
    ("劝君更尽一杯酒,西出阳关无故人", "王维", "送元二使安西", "唐"),
    ("大漠孤烟直,长河落日圆", "王维", "使至塞上", "唐"),
    ("明月松间照,清泉石上流", "王维", "山居秋暝", "唐"),
    ("天生我材必有用,千金散尽还复来", "李白", "将进酒", "唐"),
    ("长风破浪会有时,直挂云帆济沧海", "李白", "行路难", "唐"),
    ("举杯邀明月,对影成三人", "李白", "月下独酌", "唐"),
    ("抽刀断水水更流,举杯消愁愁更愁", "李白", "宣州谢朓楼饯别校书叔云", "唐"),
    ("安能摧眉折腰事权贵,使我不得开心颜", "李白", "梦游天姥吟留别", "唐"),
    ("会当凌绝顶,一览众山小", "杜甫", "望岳", "唐"),
    ("读书破万卷,下笔如有神", "杜甫", "奉赠韦左丞丈二十二韵", "唐"),
    ("出师未捷身先死,长使英雄泪满襟", "杜甫", "蜀相", "唐"),
    ("无边落木萧萧下,不尽长江滚滚来", "杜甫", "登高", "唐"),
    ("安得广厦千万间,大庇天下寒士俱欢颜", "杜甫", "茅屋为秋风所破歌", "唐"),
    ("忽如一夜春风来,千树万树梨花开", "岑参", "白雪歌送武判官归京", "唐"),
    ("沉舟侧畔千帆过,病树前头万木春", "刘禹锡", "酬乐天扬州初逢席上见赠", "唐"),
    ("旧时王谢堂前燕,飞入寻常百姓家", "刘禹锡", "乌衣巷", "唐"),
    ("千淘万漉虽辛苦,吹尽狂沙始到金", "刘禹锡", "浪淘沙", "唐"),
    ("野火烧不尽,春风吹又生", "白居易", "赋得古原草送别", "唐"),
    ("同是天涯沦落人,相逢何必曾相识", "白居易", "琵琶行", "唐"),
    ("在天愿作比翼鸟,在地愿为连理枝", "白居易", "长恨歌", "唐"),
    ("千呼万唤始出来,犹抱琵琶半遮面", "白居易", "琵琶行", "唐"),
    ("曾经沧海难为水,除却巫山不是云", "元稹", "离思", "唐"),
    ("黑云压城城欲摧,甲光向日金鳞开", "李贺", "雁门太守行", "唐"),
    ("男儿何不带吴钩,收取关山五十州", "李贺", "南园", "唐"),
    ("商女不知亡国恨,隔江犹唱后庭花", "杜牧", "泊秦淮", "唐"),
    ("停车坐爱枫林晚,霜叶红于二月花", "杜牧", "山行", "唐"),
    ("东风不与周郎便,铜雀春深锁二乔", "杜牧", "赤壁", "唐"),
    ("春蚕到死丝方尽,蜡炬成灰泪始干", "李商隐", "无题", "唐"),
    ("身无彩凤双飞翼,心有灵犀一点通", "李商隐", "无题", "唐"),
    ("夕阳无限好,只是近黄昏", "李商隐", "登乐游原", "唐"),
    ("先天下之忧而忧,后天下之乐而乐", "范仲淹", "岳阳楼记", "宋"),
    ("不以物喜,不以己悲", "范仲淹", "岳阳楼记", "宋"),
    ("醉翁之意不在酒,在乎山水之间也", "欧阳修", "醉翁亭记", "宋"),
    ("人生自是有情痴,此恨不关风与月", "欧阳修", "玉楼春", "宋"),
    ("衣带渐宽终不悔,为伊消得人憔悴", "柳永", "蝶恋花", "宋"),
    ("今宵酒醒何处?杨柳岸,晓风残月", "柳永", "雨霖铃", "宋"),
    ("但愿人长久,千里共婵娟", "苏轼", "水调歌头", "宋"),
    ("大江东去,浪淘尽,千古风流人物", "苏轼", "念奴娇·赤壁怀古", "宋"),
    ("竹杖芒鞋轻胜马,谁怕?一蓑烟雨任平生", "苏轼", "定风波", "宋"),
    ("回首向来萧瑟处,归去,也无风雨也无晴", "苏轼", "定风波", "宋"),
    ("十年生死两茫茫,不思量,自难忘", "苏轼", "江城子", "宋"),
    ("人生到处知何似,应似飞鸿踏雪泥", "苏轼", "和子由渑池怀旧", "宋"),
    ("不识庐山真面目,只缘身在此山中", "苏轼", "题西林壁", "宋"),
    ("两情若是久长时,又岂在朝朝暮暮", "秦观", "鹊桥仙", "宋"),
    ("此情无计可消除,才下眉头,却上心头", "李清照", "一剪梅", "宋"),
    ("寻寻觅觅,冷冷清清,凄凄惨惨戚戚", "李清照", "声声慢", "宋"),
    ("生当作人杰,死亦为鬼雄", "李清照", "夏日绝句", "宋"),
    ("三十功名尘与土,八千里路云和月", "岳飞", "满江红", "宋"),
    ("莫等闲,白了少年头,空悲切", "岳飞", "满江红", "宋"),
    ("壮志饥餐胡虏肉,笑谈渴饮匈奴血", "岳飞", "满江红", "宋"),
    ("山重水复疑无路,柳暗花明又一村", "陆游", "游山西村", "宋"),
    ("王师北定中原日,家祭无忘告乃翁", "陆游", "示儿", "宋"),
    ("小楼一夜听春雨,深巷明朝卖杏花", "陆游", "临安春雨初霁", "宋"),
    ("出师一表真名世,千载谁堪伯仲间", "陆游", "书愤", "宋"),
    ("人生自古谁无死,留取丹心照汗青", "文天祥", "过零丁洋", "宋"),
    ("臣心一片磁针石,不指南方不肯休", "文天祥", "扬子江", "宋"),
    ("问渠那得清如许,为有源头活水来", "朱熹", "观书有感", "宋"),
    ("等闲识得东风面,万紫千红总是春", "朱熹", "春日", "宋"),
    ("众里寻他千百度,蓦然回首,那人却在,灯火阑珊处", "辛弃疾", "青玉案·元夕", "宋"),
    ("想当年,金戈铁马,气吞万里如虎", "辛弃疾", "永遇乐·京口北固亭怀古", "宋"),
    ("醉里挑灯看剑,梦回吹角连营", "辛弃疾", "破阵子", "宋"),
    ("稻花香里说丰年,听取蛙声一片", "辛弃疾", "西江月·夜行黄沙道中", "宋"),
    ("春色满园关不住,一枝红杏出墙来", "叶绍翁", "游园不值", "宋"),
    ("落红不是无情物,化作春泥更护花", "龚自珍", "己亥杂诗", "清"),
    ("我劝天公重抖擞,不拘一格降人才", "龚自珍", "己亥杂诗", "清"),
    ("苟利国家生死以,岂因祸福避趋之", "林则徐", "赴戍登程口占示家人", "清"),
    ("海到无边天作岸,山登绝顶我为峰", "林则徐", "出老", "清"),
    ("我自横刀向天笑,去留肝胆两昆仑", "谭嗣同", "狱中题壁", "清"),
    ("横眉冷对千夫指,俯首甘为孺子牛", "鲁迅", "自嘲", "近代"),
    ("寄意寒星荃不察,我以我血荐轩辕", "鲁迅", "自题小像", "近代"),
    ("心事浩茫连广宇,于无声处听惊雷", "鲁迅", "无题", "近代"),
)


def create_database():
    """Create the database file and schema if needed and return a connection.

    The parent directory of ``DB_PATH`` is created on demand. The ``quotes``
    table and its two indexes are created only when they do not exist yet.

    Returns:
        An open ``sqlite3.Connection``; the caller is responsible for
        closing it.

    Raises:
        sqlite3.Error: If the schema cannot be created. The connection is
            closed before the error propagates.
    """
    DB_PATH.parent.mkdir(parents=True, exist_ok=True)
    conn = sqlite3.connect(DB_PATH)
    try:
        cursor = conn.cursor()
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS quotes (
                id TEXT PRIMARY KEY,
                text TEXT NOT NULL,
                author TEXT,
                source TEXT,
                dynasty TEXT,
                type TEXT,
                tags TEXT,
                emotion TEXT
            )
        """)
        cursor.execute("CREATE INDEX IF NOT EXISTS idx_type ON quotes(type)")
        cursor.execute("CREATE INDEX IF NOT EXISTS idx_dynasty ON quotes(dynasty)")
        conn.commit()
    except sqlite3.Error:
        # Do not hand back (or leak) a connection whose schema setup failed.
        conn.close()
        raise
    return conn


def _quote_label(quote):
    """Return the quote's id for error messages, tolerating malformed quotes."""
    try:
        return quote["id"]
    except (KeyError, TypeError, IndexError):
        return "<missing id>"


def insert_quotes(conn, quotes):
    """Insert quotes into the ``quotes`` table, skipping ids already present.

    Args:
        conn: Open SQLite connection whose database has the ``quotes`` table.
        quotes: Iterable of dicts. ``id`` and ``text`` are required keys;
            ``author``, ``source``, ``dynasty`` and ``type`` are optional;
            ``tags`` and ``emotion`` default to empty lists and are stored
            as JSON text.

    Returns:
        The number of rows actually inserted (ignored duplicates and failed
        rows are not counted).
    """
    cursor = conn.cursor()
    inserted = 0
    for quote in quotes:
        try:
            cursor.execute(_INSERT_SQL, (
                quote["id"],
                quote["text"],
                quote.get("author"),
                quote.get("source"),
                quote.get("dynasty"),
                quote.get("type"),
                json.dumps(quote.get("tags", []), ensure_ascii=False),
                json.dumps(quote.get("emotion", []), ensure_ascii=False),
            ))
        except (sqlite3.Error, KeyError, TypeError, ValueError, AttributeError) as e:
            # One bad record must not abort the batch. The label is looked up
            # defensively because a missing "id" may be the very error here.
            print(f"Error inserting {_quote_label(quote)}: {e}")
        else:
            # INSERT OR IGNORE reports rowcount 0 when the id already existed.
            if cursor.rowcount > 0:
                inserted += 1
    conn.commit()
    return inserted


def _fetch_poem_batches(url_template, id_prefix, dynasty, stop, step=1000):
    """Download poem batches and convert them to quote dicts.

    Args:
        url_template: URL containing an ``{offset}`` placeholder.
        id_prefix: Prefix for generated quote ids (e.g. ``"tang"``).
        dynasty: Value stored in each quote's ``dynasty`` field.
        stop: Exclusive upper bound for the batch offsets.
        step: Distance between consecutive batch offsets.

    Returns:
        A list of quote dicts for every poem of at least 10 characters.
        Batches that fail to download or parse are reported and skipped.
    """
    poems = []
    for offset in range(0, stop, step):
        url = url_template.format(offset=offset)
        try:
            response = requests.get(url, timeout=15)
            if response.status_code == 200:
                data = response.json()
                for index, poem in enumerate(data):
                    text = "".join(poem.get("paragraphs", []))
                    if len(text) >= 10:
                        # Fall back to offset+index, not the offset alone:
                        # otherwise every id-less poem in a batch would share
                        # one primary key and all but the first would be
                        # dropped by INSERT OR IGNORE.
                        poem_id = poem.get("id", f"{offset}_{index}")
                        poems.append({
                            "id": f"{id_prefix}_{poem_id}",
                            "text": text,
                            "author": poem.get("author", ""),
                            "source": poem.get("title", ""),
                            "dynasty": dynasty,
                            "type": "诗词",
                            "tags": [],
                            "emotion": [],
                        })
                print(f" Fetched {len(data)} poems from batch {offset}")
            else:
                # Surface missing or moved files instead of silently
                # returning no poems for this batch.
                print(f" Skipping batch {offset}: HTTP {response.status_code}")
            # Be polite to the remote host between requests.
            time.sleep(0.5)
        except _FETCH_ERRORS as e:
            print(f" Error fetching batch {offset}: {e}")
    return poems


def fetch_chinese_poetry():
    """Download Tang Dynasty poems and return them as quote dicts."""
    print("Fetching Tang Dynasty poems...")
    # NOTE(review): confirm this path layout still matches the upstream
    # repository; batches that do not return HTTP 200 contribute no poems.
    return _fetch_poem_batches(
        _POETRY_BASE_URL + "/poet/poet_{offset}.json",
        id_prefix="tang",
        dynasty="唐",
        stop=58000,
    )


def fetch_song_poetry():
    """Download Song Dynasty poems and return them as quote dicts."""
    print("Fetching Song Dynasty poems...")
    # NOTE(review): confirm this path layout still matches the upstream
    # repository; batches that do not return HTTP 200 contribute no poems.
    return _fetch_poem_batches(
        _POETRY_BASE_URL + "/poet/poet.song_{offset}.json",
        id_prefix="song",
        dynasty="宋",
        stop=25000,
    )


def fetch_idioms():
    """Download the idiom list and return idioms of 3+ characters as quotes.

    Returns:
        A list of quote dicts of type ``"成语"``; empty when the download
        or parsing fails (the failure is printed, not raised).
    """
    idioms = []
    url = "https://raw.githubusercontent.com/pwxcoo/chinese-xinhua/master/json/idioms.json"
    print("Fetching idioms...")
    try:
        response = requests.get(url, timeout=30)
        if response.status_code != 200:
            print(f" Error fetching idioms: HTTP {response.status_code}")
            return idioms
        for item in response.json():
            word = item.get("word", "")
            if word and len(word) >= 3:
                idioms.append({
                    "id": f"idiom_{word}",
                    "text": word,
                    "author": None,
                    "source": item.get("derivation", ""),
                    "dynasty": None,
                    "type": "成语",
                    "tags": [],
                    "emotion": [],
                })
        print(f" Fetched {len(idioms)} idioms")
    except _FETCH_ERRORS as e:
        print(f" Error fetching idioms: {e}")
    return idioms


def fetch_classic_quotes():
    """Return the built-in famous lines as quote dicts of type ``"名句"``.

    Ids are ``classic_<n>`` where ``n`` is the entry's position in the
    built-in list.
    """
    return [
        {
            "id": f"classic_{i}",
            "text": text,
            "author": author,
            "source": source,
            "dynasty": dynasty,
            "type": "名句",
            "tags": [],
            "emotion": [],
        }
        for i, (text, author, source, dynasty) in enumerate(_CLASSIC_TEXTS)
    ]


def main():
    """Run every collection step and store the results in the database."""
    print("=" * 60)
    print("名句数据收集")
    print("=" * 60)

    # (banner, fetch function, label used in the "Inserted ..." summary)
    steps = (
        ("Fetching Tang Dynasty poems...", fetch_chinese_poetry, "Tang poems"),
        ("Fetching Song Dynasty poems...", fetch_song_poetry, "Song poems"),
        ("Fetching idioms...", fetch_idioms, "idioms"),
        ("Adding classic quotes...", fetch_classic_quotes, "classic quotes"),
    )

    conn = create_database()
    total_inserted = 0
    try:
        for number, (banner, fetch, label) in enumerate(steps, start=1):
            print(f"\n[{number}/{len(steps)}] {banner}")
            inserted = insert_quotes(conn, fetch())
            total_inserted += inserted
            print(f" Inserted {inserted} {label}")
    finally:
        # Close the connection even when a step raises unexpectedly.
        conn.close()

    print("\n" + "=" * 60)
    print(f"Total inserted: {total_inserted} quotes")
    print(f"Database saved to: {DB_PATH}")
    print("=" * 60)


if __name__ == "__main__":
    main()