Spaces:

dayuian
/

VocabLine

Sleeping

App Files Files Community

dayuian committed on Feb 18

Commit

dd2233b

verified ·

1 Parent(s): 1112df1

Update vocab.py

Browse files

Files changed (1) hide show

vocab.py +76 -31

vocab.py CHANGED Viewed

@@ -1,79 +1,124 @@
 import json
 import random
 import os
 import re
 from transformers import AutoModelForCausalLM, AutoTokenizer
-# Load the model and tokenizer once at import time so they are not reloaded on every request.
 model_name = "EleutherAI/pythia-410m"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 model = AutoModelForCausalLM.from_pretrained(model_name)
 DATA_DIR = "./data"
 def get_sources():
-    """掃描資料夾，回傳所有單字庫名稱"""
     files = os.listdir(DATA_DIR)
     sources = [f.split(".json")[0] for f in files if f.endswith(".json")]
     return sources
 def clean_sentence(output):
-    """清理 GPT 生成的句子，去除雜訊"""
     output = re.sub(r"Write.*?beginners\.", "", output, flags=re.IGNORECASE).strip()
     output = re.sub(r"\*\*?\d+\.*\*\*", "", output).strip()
     if not output.endswith("."):
         output += "."
     return output
 def get_words_with_sentences(source, n):
-    """抽取單字 + 生成例句，回傳結果和狀態"""
     status = []
     display_result = ""
     try:
-        # 讀取單字庫資料
         data_path = os.path.join(DATA_DIR, f"{source}.json")
         with open(data_path, 'r', encoding='utf-8') as f:
             words = json.load(f)
-        # 隨機抽取
         selected_words = random.sample(words, n)
         results = []
         for i, word_data in enumerate(selected_words):
-            status.append(f"正在生成第 {i + 1}/{n} 個單字 [{word_data['word']}] 例句...")
             word = word_data['word']
-            # GPT 造句 Prompt
-            prompt = f"Use the word '{word}' in a simple English sentence suitable for beginners. Output only the sentence."
-            inputs = tokenizer(prompt, return_tensors="pt")
-            outputs = model.generate(**inputs, max_new_tokens=30)
-            sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
-            clean_output = clean_sentence(sentence)
-            results.append({
-                "word": word,
-                "phonetic": word_data["phonetic"],
-                "sentence": clean_output
-            })
-            # 美化輸出文字
             display_result += f"""
             <div style="border-bottom: 1px solid #ddd; margin-bottom: 10px; padding-bottom: 5px;">
                 <p><strong>📖 單字：</strong> {word}</p>
-                <p><strong>🔤 音標：</strong> {word_data['phonetic']}</p>
-                <p><strong>✍️ 例句：</strong> {clean_output}</p>
             </div>
             """
         status.append("✅ 完成！")
-        # 以HTML形式回傳美化後的結果
         return display_result, "\n".join(status)
     except Exception as e:
         status.append(f"❌ 發生錯誤: {str(e)}")
         return f"<p style='color:red;'>發生錯誤：{str(e)}</p>", "\n".join(status)

+import sqlite3
 import json
 import random
 import os
 import re
 from transformers import AutoModelForCausalLM, AutoTokenizer
+# Load the causal language model and its tokenizer.
 model_name = "EleutherAI/pythia-410m"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 model = AutoModelForCausalLM.from_pretrained(model_name)
+# Data directory, and the SQLite sentence-cache file stored inside it.
 DATA_DIR = "./data"
+DB_PATH = os.path.join(DATA_DIR, "sentences.db")
def init_db(db_path=None):
    """Create the ``sentences`` cache table if it does not exist yet.

    Args:
        db_path: SQLite database file to initialise. Defaults to the
            module-level ``DB_PATH``.

    Raises:
        sqlite3.Error: If the database cannot be opened or the DDL fails.
        OSError: If the parent directory cannot be created.
    """
    from contextlib import closing

    target = DB_PATH if db_path is None else db_path
    # sqlite3.connect() creates the database file but not its parent folder.
    parent = os.path.dirname(target)
    if parent:
        os.makedirs(parent, exist_ok=True)
    # closing() releases the connection even if the statement fails; the
    # connection's own context manager commits on success / rolls back on error.
    with closing(sqlite3.connect(target)) as conn, conn:
        conn.execute('''
        CREATE TABLE IF NOT EXISTS sentences (
            word TEXT PRIMARY KEY,
            phonetic TEXT,
            sentence TEXT,
            created_at DATETIME DEFAULT CURRENT_TIMESTAMP
        )
        ''')
def get_sources(data_dir=None):
    """Scan the data directory and return the word-list names for the menu.

    A word list is any file whose name ends in ``.json``; its name is the
    file name with that suffix removed.

    Args:
        data_dir: Directory to scan. Defaults to the module-level
            ``DATA_DIR``.

    Returns:
        Alphabetically sorted list of word-list names.

    Raises:
        OSError: If the directory cannot be listed.
    """
    folder = DATA_DIR if data_dir is None else data_dir
    suffix = ".json"
    # Strip only the trailing suffix: splitting on ".json" would truncate a
    # name such as "a.json.v2.json" to "a", which no longer maps to a file.
    # Sorting makes the result independent of os.listdir()'s arbitrary order.
    return sorted(
        name[:-len(suffix)]
        for name in os.listdir(folder)
        if name.endswith(suffix) and name != suffix
    )
def get_sentence(word, db_path=None):
    """Look up the cached example sentence for a word.

    Args:
        word: The word to look up (primary key of the ``sentences`` table).
        db_path: SQLite database file to query. Defaults to the module-level
            ``DB_PATH``.

    Returns:
        A ``(word, phonetic, sentence)`` tuple, or ``None`` when the word has
        no cached sentence.

    Raises:
        sqlite3.Error: If the database cannot be opened or queried.
    """
    from contextlib import closing

    target = DB_PATH if db_path is None else db_path
    # closing() guarantees the connection is released even if the query fails.
    with closing(sqlite3.connect(target)) as conn:
        cursor = conn.execute(
            'SELECT word, phonetic, sentence FROM sentences WHERE word=?',
            (word,),
        )
        return cursor.fetchone()
def save_sentence(word, phonetic, sentence, db_path=None):
    """Insert or update the cached example sentence for a word.

    If the word already has a row, its ``sentence`` and ``phonetic`` columns
    are overwritten.

    Args:
        word: The word (primary key of the ``sentences`` table).
        phonetic: Phonetic transcription stored alongside the word.
        sentence: Example sentence to cache.
        db_path: SQLite database file to write. Defaults to the module-level
            ``DB_PATH``.

    Raises:
        sqlite3.Error: If the database cannot be opened or written.
    """
    from contextlib import closing

    target = DB_PATH if db_path is None else db_path
    # closing() releases the connection even if the statement fails; the
    # connection's own context manager commits on success / rolls back on error.
    with closing(sqlite3.connect(target)) as conn, conn:
        conn.execute('''
        INSERT INTO sentences (word, phonetic, sentence)
        VALUES (?, ?, ?)
        ON CONFLICT(word) DO UPDATE SET sentence=excluded.sentence, phonetic=excluded.phonetic
        ''', (word, phonetic, sentence))
def clean_sentence(output):
    """Strip prompt residue and list markers from a generated sentence.

    Args:
        output: Raw text decoded from the language model.

    Returns:
        The cleaned text, guaranteed to end with terminal punctuation. A
        period is appended only when the text does not already end in
        ``.``, ``!`` or ``?`` (optionally followed by a closing quote or
        parenthesis).
    """
    # Keep only what follows the last colon, which discards a leading
    # "...:" prompt echoed back in the decoded text.
    text = output.split(":")[-1].strip()
    # Drop a leading list number such as "1. ".
    text = re.sub(r"^\d+\.\s*", "", text).strip()
    # Drop an echoed "Write ... beginners." instruction, if present.
    text = re.sub(r"Write.*?beginners\.", "", text, flags=re.IGNORECASE).strip()
    # Drop markdown-style bold list numbers such as "**1.**".
    text = re.sub(r"\*\*?\d+\.*\*\*", "", text).strip()
    # Look past closing quotes/brackets so 'He said "hi."' is left alone, and
    # accept "?" / "!" so questions do not become "...?.".
    if not text.rstrip("\"'\u201d\u2019)").endswith((".", "!", "?")):
        text += "."
    return text
def get_words_with_sentences(source, n):
    """Draw random words and attach an example sentence to each.

    For every drawn word the sentence cache is consulted first; only on a
    cache miss is a sentence generated with the language model, cleaned, and
    stored back into the cache.

    The word list is read from ``<DATA_DIR>/<source>.json`` and is indexed as
    a sequence of mappings with ``'word'`` and ``'phonetic'`` keys.

    Args:
        source: Name of the word list (file name without ``.json``).
        n: How many words to draw. Coerced with ``int()`` and clamped to the
            size of the word list.

    Returns:
        A ``(html, status)`` tuple: ``html`` is the rendered result (or a red
        error paragraph) and ``status`` is a newline-joined progress log.
        Exceptions are reported through the return value, never raised.
    """
    from html import escape

    status = []
    cards = []
    try:
        # Load the word list.
        data_path = os.path.join(DATA_DIR, f"{source}.json")
        with open(data_path, 'r', encoding='utf-8') as f:
            words = json.load(f)

        # random.sample() rejects floats and sample sizes larger than the
        # population, so normalise the request first.
        count = max(0, min(int(n), len(words)))
        selected_words = random.sample(words, count)

        for i, word_data in enumerate(selected_words):
            word = word_data['word']
            phonetic = word_data['phonetic']

            # Reuse a cached sentence when one exists.
            cached_result = get_sentence(word)
            if cached_result:
                sentence = cached_result[2]
                status.append(f"✅ {word} 已有例句，從句庫讀取")
            else:
                # Cache miss: generate, clean, then persist the sentence.
                status.append(f"📝 正在生成第 {i + 1}/{count} 個單字 [{word}] 例句...")
                prompt = f"A simple English sentence with the word '{word}':"
                inputs = tokenizer(prompt, return_tensors="pt")
                outputs = model.generate(**inputs, max_new_tokens=30)
                sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
                sentence = clean_sentence(sentence)
                save_sentence(word, phonetic, sentence)

            # Model output and file contents are interpolated into HTML, so
            # escape them before rendering.
            cards.append(f"""
            <div style="border-bottom: 1px solid #ddd; margin-bottom: 10px; padding-bottom: 5px;">
                <p><strong>📖 單字：</strong> {escape(str(word))}</p>
                <p><strong>🔤 音標：</strong> {escape(str(phonetic))}</p>
                <p><strong>✍️ 例句：</strong> {escape(str(sentence))}</p>
            </div>
            """)

        status.append("✅ 完成！")
        return "".join(cards), "\n".join(status)
    except Exception as e:
        # UI boundary: surface the failure to the user instead of crashing.
        status.append(f"❌ 發生錯誤: {str(e)}")
        return f"<p style='color:red;'>發生錯誤：{escape(str(e))}</p>", "\n".join(status)
+# Create the sentence table automatically at startup (this call runs when the module is imported).
+init_db()