"""Collect Tang/Song poems, idioms and classic quotes into a SQLite database."""
| import json | |
| import sqlite3 | |
| import requests | |
| import time | |
| from pathlib import Path | |
# SQLite database file that create_database() opens (its parent directory is
# created on demand) and whose path main() reports at the end of a run.
DB_PATH = Path("data/quotes.db")
def create_database():
    """Open the quotes database, creating its directory, table and indexes.

    All schema statements use ``IF NOT EXISTS``, so calling this against an
    already-initialised database leaves existing data untouched.

    Returns:
        An open ``sqlite3.Connection`` to ``DB_PATH``. The caller is
        responsible for closing it.
    """
    # sqlite3 creates the database file itself but not the folder holding it.
    DB_PATH.parent.mkdir(parents=True, exist_ok=True)

    schema_statements = (
        """
        CREATE TABLE IF NOT EXISTS quotes (
            id TEXT PRIMARY KEY,
            text TEXT NOT NULL,
            author TEXT,
            source TEXT,
            dynasty TEXT,
            type TEXT,
            tags TEXT,
            emotion TEXT
        )
        """,
        "CREATE INDEX IF NOT EXISTS idx_type ON quotes(type)",
        "CREATE INDEX IF NOT EXISTS idx_dynasty ON quotes(dynasty)",
    )

    connection = sqlite3.connect(DB_PATH)
    db_cursor = connection.cursor()
    for statement in schema_statements:
        db_cursor.execute(statement)
    connection.commit()
    return connection
def insert_quotes(conn, quotes):
    """Insert quote records into the ``quotes`` table, skipping duplicates.

    Each record is inserted with ``INSERT OR IGNORE``, so a record whose
    ``id`` already exists is left untouched and not counted. A record that
    cannot be inserted (missing ``id``/``text``, constraint violation,
    unserialisable tags, ...) is reported on stdout and skipped; it never
    aborts the rest of the batch.

    Args:
        conn: Open ``sqlite3.Connection`` whose database has the ``quotes``
            table.
        quotes: Iterable of dicts. ``id`` and ``text`` are required;
            ``author``, ``source``, ``dynasty`` and ``type`` are optional;
            ``tags`` and ``emotion`` are stored as JSON text and default to
            an empty list.

    Returns:
        Number of rows actually inserted.
    """
    cursor = conn.cursor()
    inserted = 0
    for quote in quotes:
        try:
            cursor.execute("""
                INSERT OR IGNORE INTO quotes (id, text, author, source, dynasty, type, tags, emotion)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?)
            """, (
                quote['id'],
                quote['text'],
                quote.get('author'),
                quote.get('source'),
                quote.get('dynasty'),
                quote.get('type'),
                # ensure_ascii=False keeps Chinese text readable in the database.
                json.dumps(quote.get('tags', []), ensure_ascii=False),
                json.dumps(quote.get('emotion', []), ensure_ascii=False)
            ))
        except Exception as e:
            # Use .get here: indexing quote['id'] would raise a second KeyError
            # from inside the handler when 'id' is the very key that is
            # missing, aborting the whole batch before the commit.
            print(f"Error inserting {quote.get('id', '<missing id>')}: {e}")
        else:
            # rowcount is 0 when OR IGNORE skipped an existing primary key.
            if cursor.rowcount > 0:
                inserted += 1
    conn.commit()
    return inserted
def _poems_to_quotes(data, id_prefix, dynasty, batch):
    """Convert one decoded JSON batch of poems into quote records.

    Entries that are not dicts, or whose joined text is shorter than 10
    characters, are skipped.
    """
    records = []
    for index, poem in enumerate(data):
        if not isinstance(poem, dict):
            continue
        text = "".join(poem.get("paragraphs") or [])
        if len(text) < 10:
            continue
        poem_id = poem.get("id")
        if poem_id is None or poem_id == "":
            # Falling back to the batch number alone would give every id-less
            # poem in the batch the same primary key, and INSERT OR IGNORE
            # would then silently keep only the first one.
            poem_id = f"{batch}_{index}"
        records.append({
            "id": f"{id_prefix}_{poem_id}",
            "text": text,
            "author": poem.get("author", ""),
            "source": poem.get("title", ""),
            "dynasty": dynasty,
            "type": "诗词",
            "tags": [],
            "emotion": [],
        })
    return records


def _fetch_poetry_batches(label, path_template, id_prefix, dynasty, stop, step=1000):
    """Download numbered JSON batches and return their poems as quote records.

    A batch that fails (network error, non-200 status, undecodable JSON) is
    reported on stdout and skipped; the remaining batches are still fetched.
    """
    base_url = "https://raw.githubusercontent.com/chinese-poetry/chinese-poetry/master"
    poems = []
    print(f"Fetching {label} Dynasty poems...")
    for batch in range(0, stop, step):
        url = f"{base_url}/{path_template.format(batch=batch)}"
        try:
            response = requests.get(url, timeout=15)
            if response.status_code != 200:
                # Report instead of skipping silently, so a stale path is
                # distinguishable from an empty dataset.
                print(f" Skipping batch {batch}: HTTP {response.status_code}")
                continue
            data = response.json()
            poems.extend(_poems_to_quotes(data, id_prefix, dynasty, batch))
            print(f" Fetched {len(data)} poems from batch {batch}")
            # Brief pause between successful downloads.
            time.sleep(0.5)
        except Exception as e:
            print(f" Error fetching batch {batch}: {e}")
    return poems


def fetch_chinese_poetry():
    """Fetch Tang Dynasty poems from the chinese-poetry GitHub repository.

    Returns:
        List of quote dicts (keys ``id``, ``text``, ``author``, ``source``,
        ``dynasty``, ``type``, ``tags``, ``emotion``) with ``dynasty`` set to
        "唐". Empty if every batch fails.
    """
    # NOTE(review): confirm this path still matches the upstream repository
    # layout; any non-200 response is now printed per batch.
    return _fetch_poetry_batches("Tang", "poet/poet_{batch}.json", "tang", "唐", 58000)


def fetch_song_poetry():
    """Fetch Song Dynasty poems from the chinese-poetry GitHub repository.

    Returns:
        List of quote dicts (keys ``id``, ``text``, ``author``, ``source``,
        ``dynasty``, ``type``, ``tags``, ``emotion``) with ``dynasty`` set to
        "宋". Empty if every batch fails.
    """
    # NOTE(review): confirm this path still matches the upstream repository
    # layout; any non-200 response is now printed per batch.
    return _fetch_poetry_batches("Song", "poet/poet.song_{batch}.json", "song", "宋", 25000)
def fetch_idioms():
    """Fetch idioms from the chinese-xinhua GitHub repository.

    Only entries whose ``word`` has at least 3 characters are kept. Any
    failure (network error, non-200 status, undecodable JSON) is reported on
    stdout; idioms collected before the failure are still returned.

    Returns:
        List of quote dicts with ``type`` "成语", ``author`` and ``dynasty``
        set to ``None``, and the entry's ``derivation`` as ``source``.
    """
    url = "https://raw.githubusercontent.com/pwxcoo/chinese-xinhua/master/json/idioms.json"
    idioms = []
    print("Fetching idioms...")
    try:
        response = requests.get(url, timeout=30)
        if response.status_code != 200:
            # Previously a non-200 reply produced no output at all, leaving
            # an empty result unexplained.
            print(f" Error fetching idioms: HTTP {response.status_code}")
            return idioms
        for item in response.json():
            word = item.get('word', '')
            if word and len(word) >= 3:
                idioms.append({
                    "id": f"idiom_{word}",
                    "text": word,
                    "author": None,
                    "source": item.get('derivation', ''),
                    "dynasty": None,
                    "type": "成语",
                    "tags": [],
                    "emotion": []
                })
        print(f" Fetched {len(idioms)} idioms")
    except Exception as e:
        print(f" Error fetching idioms: {e}")
    return idioms
def fetch_classic_quotes():
    """Return the built-in list of classic quotes as quote records.

    No network access is involved; the quotes are hard-coded below.

    Returns:
        List of dicts with keys ``id``, ``text``, ``author``, ``source``,
        ``dynasty``, ``type`` (always "名句"), ``tags`` and ``emotion`` (both
        empty lists).
    """
    # Each row is (text, author, source, dynasty). Ids are derived from the
    # row position, so add new rows at the end to keep existing ids stable.
    rows = (
        ("学而时习之,不亦说乎", "孔子", "论语", "春秋"),
        ("温故而知新,可以为师矣", "孔子", "论语", "春秋"),
        ("学而不思则罔,思而不学则殆", "孔子", "论语", "春秋"),
        ("知之为知之,不知为不知,是知也", "孔子", "论语", "春秋"),
        ("己所不欲,勿施于人", "孔子", "论语", "春秋"),
        ("三人行,必有我师焉", "孔子", "论语", "春秋"),
        ("君子坦荡荡,小人长戚戚", "孔子", "论语", "春秋"),
        ("生于忧患,死于安乐", "孟子", "孟子", "战国"),
        ("得道多助,失道寡助", "孟子", "孟子", "战国"),
        ("富贵不能淫,贫贱不能移,威武不能屈", "孟子", "孟子", "战国"),
        ("天行健,君子以自强不息", "佚名", "周易", "先秦"),
        ("地势坤,君子以厚德载物", "佚名", "周易", "先秦"),
        ("路漫漫其修远兮,吾将上下而求索", "屈原", "离骚", "战国"),
        ("长太息以掩涕兮,哀民生之多艰", "屈原", "离骚", "战国"),
        ("亦余心之所善兮,虽九死其犹未悔", "屈原", "离骚", "战国"),
        ("老骥伏枥,志在千里", "曹操", "龟虽寿", "东汉"),
        ("烈士暮年,壮心不已", "曹操", "龟虽寿", "东汉"),
        ("山不厌高,海不厌深", "曹操", "短歌行", "东汉"),
        ("周公吐哺,天下归心", "曹操", "短歌行", "东汉"),
        ("捐躯赴国难,视死忽如归", "曹植", "白马篇", "三国"),
        ("本是同根生,相煎何太急", "曹植", "七步诗", "三国"),
        ("鞠躬尽瘁,死而后已", "诸葛亮", "后出师表", "三国"),
        ("非淡泊无以明志,非宁静无以致远", "诸葛亮", "诫子书", "三国"),
        ("采菊东篱下,悠然见南山", "陶渊明", "饮酒", "东晋"),
        ("羁鸟恋旧林,池鱼思故渊", "陶渊明", "归园田居", "东晋"),
        ("海内存知己,天涯若比邻", "王勃", "送杜少府之任蜀州", "唐"),
        ("落霞与孤鹜齐飞,秋水共长天一色", "王勃", "滕王阁序", "唐"),
        ("前不见古人,后不见来者", "陈子昂", "登幽州台歌", "唐"),
        ("念天地之悠悠,独怆然而涕下", "陈子昂", "登幽州台歌", "唐"),
        ("春江潮水连海平,海上明月共潮生", "张若虚", "春江花月夜", "唐"),
        ("人生代代无穷已,江月年年望相似", "张若虚", "春江花月夜", "唐"),
        ("欲穷千里目,更上一层楼", "王之涣", "登鹳雀楼", "唐"),
        ("黄河远上白云间,一片孤城万仞山", "王之涣", "凉州词", "唐"),
        ("独在异乡为异客,每逢佳节倍思亲", "王维", "九月九日忆山东兄弟", "唐"),
        ("劝君更尽一杯酒,西出阳关无故人", "王维", "送元二使安西", "唐"),
        ("大漠孤烟直,长河落日圆", "王维", "使至塞上", "唐"),
        ("明月松间照,清泉石上流", "王维", "山居秋暝", "唐"),
        ("天生我材必有用,千金散尽还复来", "李白", "将进酒", "唐"),
        ("长风破浪会有时,直挂云帆济沧海", "李白", "行路难", "唐"),
        ("举杯邀明月,对影成三人", "李白", "月下独酌", "唐"),
        ("抽刀断水水更流,举杯消愁愁更愁", "李白", "宣州谢朓楼饯别校书叔云", "唐"),
        ("安能摧眉折腰事权贵,使我不得开心颜", "李白", "梦游天姥吟留别", "唐"),
        ("会当凌绝顶,一览众山小", "杜甫", "望岳", "唐"),
        ("读书破万卷,下笔如有神", "杜甫", "奉赠韦左丞丈二十二韵", "唐"),
        ("出师未捷身先死,长使英雄泪满襟", "杜甫", "蜀相", "唐"),
        ("无边落木萧萧下,不尽长江滚滚来", "杜甫", "登高", "唐"),
        ("安得广厦千万间,大庇天下寒士俱欢颜", "杜甫", "茅屋为秋风所破歌", "唐"),
        ("忽如一夜春风来,千树万树梨花开", "岑参", "白雪歌送武判官归京", "唐"),
        ("沉舟侧畔千帆过,病树前头万木春", "刘禹锡", "酬乐天扬州初逢席上见赠", "唐"),
        ("旧时王谢堂前燕,飞入寻常百姓家", "刘禹锡", "乌衣巷", "唐"),
        ("千淘万漉虽辛苦,吹尽狂沙始到金", "刘禹锡", "浪淘沙", "唐"),
        ("野火烧不尽,春风吹又生", "白居易", "赋得古原草送别", "唐"),
        ("同是天涯沦落人,相逢何必曾相识", "白居易", "琵琶行", "唐"),
        ("在天愿作比翼鸟,在地愿为连理枝", "白居易", "长恨歌", "唐"),
        ("千呼万唤始出来,犹抱琵琶半遮面", "白居易", "琵琶行", "唐"),
        ("曾经沧海难为水,除却巫山不是云", "元稹", "离思", "唐"),
        ("黑云压城城欲摧,甲光向日金鳞开", "李贺", "雁门太守行", "唐"),
        ("男儿何不带吴钩,收取关山五十州", "李贺", "南园", "唐"),
        ("商女不知亡国恨,隔江犹唱后庭花", "杜牧", "泊秦淮", "唐"),
        ("停车坐爱枫林晚,霜叶红于二月花", "杜牧", "山行", "唐"),
        ("东风不与周郎便,铜雀春深锁二乔", "杜牧", "赤壁", "唐"),
        ("春蚕到死丝方尽,蜡炬成灰泪始干", "李商隐", "无题", "唐"),
        ("身无彩凤双飞翼,心有灵犀一点通", "李商隐", "无题", "唐"),
        ("夕阳无限好,只是近黄昏", "李商隐", "登乐游原", "唐"),
        ("先天下之忧而忧,后天下之乐而乐", "范仲淹", "岳阳楼记", "宋"),
        ("不以物喜,不以己悲", "范仲淹", "岳阳楼记", "宋"),
        ("醉翁之意不在酒,在乎山水之间也", "欧阳修", "醉翁亭记", "宋"),
        ("人生自是有情痴,此恨不关风与月", "欧阳修", "玉楼春", "宋"),
        ("衣带渐宽终不悔,为伊消得人憔悴", "柳永", "蝶恋花", "宋"),
        ("今宵酒醒何处?杨柳岸,晓风残月", "柳永", "雨霖铃", "宋"),
        ("但愿人长久,千里共婵娟", "苏轼", "水调歌头", "宋"),
        ("大江东去,浪淘尽,千古风流人物", "苏轼", "念奴娇·赤壁怀古", "宋"),
        ("竹杖芒鞋轻胜马,谁怕?一蓑烟雨任平生", "苏轼", "定风波", "宋"),
        ("回首向来萧瑟处,归去,也无风雨也无晴", "苏轼", "定风波", "宋"),
        ("十年生死两茫茫,不思量,自难忘", "苏轼", "江城子", "宋"),
        ("人生到处知何似,应似飞鸿踏雪泥", "苏轼", "和子由渑池怀旧", "宋"),
        ("不识庐山真面目,只缘身在此山中", "苏轼", "题西林壁", "宋"),
        ("两情若是久长时,又岂在朝朝暮暮", "秦观", "鹊桥仙", "宋"),
        ("此情无计可消除,才下眉头,却上心头", "李清照", "一剪梅", "宋"),
        ("寻寻觅觅,冷冷清清,凄凄惨惨戚戚", "李清照", "声声慢", "宋"),
        ("生当作人杰,死亦为鬼雄", "李清照", "夏日绝句", "宋"),
        ("三十功名尘与土,八千里路云和月", "岳飞", "满江红", "宋"),
        ("莫等闲,白了少年头,空悲切", "岳飞", "满江红", "宋"),
        ("壮志饥餐胡虏肉,笑谈渴饮匈奴血", "岳飞", "满江红", "宋"),
        ("山重水复疑无路,柳暗花明又一村", "陆游", "游山西村", "宋"),
        ("王师北定中原日,家祭无忘告乃翁", "陆游", "示儿", "宋"),
        ("小楼一夜听春雨,深巷明朝卖杏花", "陆游", "临安春雨初霁", "宋"),
        ("出师一表真名世,千载谁堪伯仲间", "陆游", "书愤", "宋"),
        ("人生自古谁无死,留取丹心照汗青", "文天祥", "过零丁洋", "宋"),
        ("臣心一片磁针石,不指南方不肯休", "文天祥", "扬子江", "宋"),
        ("问渠那得清如许,为有源头活水来", "朱熹", "观书有感", "宋"),
        ("等闲识得东风面,万紫千红总是春", "朱熹", "春日", "宋"),
        ("众里寻他千百度,蓦然回首,那人却在,灯火阑珊处", "辛弃疾", "青玉案·元夕", "宋"),
        ("想当年,金戈铁马,气吞万里如虎", "辛弃疾", "永遇乐·京口北固亭怀古", "宋"),
        ("醉里挑灯看剑,梦回吹角连营", "辛弃疾", "破阵子", "宋"),
        ("稻花香里说丰年,听取蛙声一片", "辛弃疾", "西江月·夜行黄沙道中", "宋"),
        ("春色满园关不住,一枝红杏出墙来", "叶绍翁", "游园不值", "宋"),
        ("落红不是无情物,化作春泥更护花", "龚自珍", "己亥杂诗", "清"),
        ("我劝天公重抖擞,不拘一格降人才", "龚自珍", "己亥杂诗", "清"),
        ("苟利国家生死以,岂因祸福避趋之", "林则徐", "赴戍登程口占示家人", "清"),
        ("海到无边天作岸,山登绝顶我为峰", "林则徐", "出老", "清"),
        ("我自横刀向天笑,去留肝胆两昆仑", "谭嗣同", "狱中题壁", "清"),
        ("横眉冷对千夫指,俯首甘为孺子牛", "鲁迅", "自嘲", "近代"),
        ("寄意寒星荃不察,我以我血荐轩辕", "鲁迅", "自题小像", "近代"),
        ("心事浩茫连广宇,于无声处听惊雷", "鲁迅", "无题", "近代"),
    )
    return [
        {
            "id": f"classic_{position}",
            "text": text,
            "author": author,
            "source": source,
            "dynasty": dynasty,
            "type": "名句",
            "tags": [],
            "emotion": [],
        }
        for position, (text, author, source, dynasty) in enumerate(rows)
    ]
def main():
    """Run the full collection pipeline and print a summary.

    Creates/opens the database, then for each source in turn (Tang poems,
    Song poems, idioms, built-in classic quotes) fetches the records and
    inserts them, printing how many new rows each source contributed and the
    overall total.
    """
    banner = "=" * 60
    print(banner)
    print("名句数据收集")
    print(banner)

    # (progress message, fetcher, noun used in the "Inserted ..." line)
    steps = (
        ("Fetching Tang Dynasty poems", fetch_chinese_poetry, "Tang poems"),
        ("Fetching Song Dynasty poems", fetch_song_poetry, "Song poems"),
        ("Fetching idioms", fetch_idioms, "idioms"),
        ("Adding classic quotes", fetch_classic_quotes, "classic quotes"),
    )

    conn = create_database()
    total_inserted = 0
    try:
        for number, (action, fetch, noun) in enumerate(steps, start=1):
            print(f"\n[{number}/{len(steps)}] {action}...")
            inserted = insert_quotes(conn, fetch())
            total_inserted += inserted
            print(f" Inserted {inserted} {noun}")
    finally:
        # Close the connection even if a step raises, so the database file
        # handle is not leaked.
        conn.close()

    print("\n" + banner)
    print(f"Total inserted: {total_inserted} quotes")
    print(f"Database saved to: {DB_PATH}")
    print(banner)
# Run the collection pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()