# knowledge_uploader.py
"""Ingest language-knowledge files into the Supabase ``lang_knowledge`` table.

Every supported file (PDF, ODT, CSV) in a data folder is turned into text
chunks, each chunk is embedded with the Google embedding model, and the
chunk plus its vector is inserted as one row.
"""
import os
import re

import google.generativeai as genai
import pandas as pd
from langchain_text_splitters import RecursiveCharacterTextSplitter
from odf import teletype, text
from odf.opendocument import load
from pypdf import PdfReader
from supabase import create_client

import config

# 1. Initialise connections.
genai.configure(api_key=config.GEMINI_KEY)
supabase = create_client(config.SUPABASE_URL, config.SUPABASE_KEY)

# 2. Configure the text splitter (tuned for grammar and cultural corpora).
# NOTE(review): "!" and "?" are ASCII here; if the corpora use full-width
# punctuation, those variants may have been intended -- confirm.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=600,
    chunk_overlap=100,
    separators=["\n\n", "\n", "。", "!", "?", " ", ""],
)

# Filename layout: [<code>_<category label>]_<language code>_<title>.<ext>
# The language group is lazy so that it stops at the first underscore and a
# title containing underscores is not absorbed into the language code.
_FILENAME_PATTERN = re.compile(r"\[(\d+)_(\w+)\]_(\w+?)_")

# Numeric filename prefix -> category stored in the database.
_CATEGORY_MAP = {
    "01": "Dict",  # vocabulary
    "02": "Gram",  # grammar rules
    "03": "Cult",  # cultural corpus
    "04": "Corp",  # general corpus
}


def get_embedding(text):
    """Embed *text* with the Google embedding model (768 dimensions).

    Args:
        text: The chunk to embed. (The parameter shadows the ``odf.text``
            module inside this function only; the module is not used here.)

    Returns:
        The ``'embedding'`` entry of the API result, or ``None`` when the
        call fails. Failures are printed, not raised, so that one bad chunk
        does not abort an ingestion run.
    """
    try:
        result = genai.embed_content(
            model="models/text-embedding-004",
            content=text,
            task_type="retrieval_document",
        )
        return result['embedding']
    except Exception as e:
        print(f"❌ 向量化失敗: {e}")
        return None


def parse_file_info(filename):
    """Derive the language code and category from a knowledge-file name.

    Expected format, e.g. ``[01_Dict]_TRK_太魯閣語辭典.odt``: a numeric code
    and a category label in brackets, then the language code, then the title.

    Args:
        filename: Base name of the file (no directory part).

    Returns:
        A ``(language_code, category)`` tuple. The category is looked up from
        the numeric code and is ``"Other"`` for unknown codes. Names that do
        not follow the format yield ``("Unknown", "Other")``.
    """
    try:
        match = _FILENAME_PATTERN.match(filename)
    except TypeError:
        # Non-string input: fall back to the defaults instead of raising.
        match = None
    if match is None:
        return "Unknown", "Other"

    # Group 2 is the bracketed category label (e.g. "Dict"); the language
    # code (e.g. "TRK") is group 3.
    code, _category_label, lang_tag = match.groups()
    return lang_tag, _CATEGORY_MAP.get(code, "Other")


def _load_chunks(file_path):
    """Read *file_path* and return its text chunks, or ``None`` if unsupported."""
    lowered = file_path.lower()

    # --- PDF ---
    if lowered.endswith(".pdf"):
        reader = PdfReader(file_path)
        # Guard against a page yielding no text so the join cannot fail.
        full_text = "".join(
            (page.extract_text() or "") + "\n" for page in reader.pages
        )
        return text_splitter.split_text(full_text)

    # --- ODT ---
    if lowered.endswith(".odt"):
        odt_doc = load(file_path)
        paragraphs = odt_doc.getElementsByType(text.P)
        full_text = "\n".join(teletype.extractText(p) for p in paragraphs)
        # Dictionary-type material could be split more finely; for now it
        # uses the shared splitter.
        return text_splitter.split_text(full_text)

    # --- CSV (fallback): one chunk per row, cells joined with " | " ---
    if lowered.endswith(".csv"):
        df = pd.read_csv(file_path)
        return df.apply(lambda x: " | ".join(x.astype(str)), axis=1).tolist()

    return None


def process_file(file_path):
    """Load one file, embed each chunk and insert the rows into Supabase.

    The reader is chosen by file extension (``.pdf``, ``.odt``, ``.csv``).
    Paths with any other extension are reported and skipped. Chunks whose
    embedding or database insert fails are reported and skipped; the final
    message states how many chunks were stored.

    Args:
        file_path: Path of the file to ingest.
    """
    filename = os.path.basename(file_path)
    tribe, category = parse_file_info(filename)
    print(f"🚀 處理中: {filename} (族語: {tribe}, 類別: {category})")

    content_list = _load_chunks(file_path)
    if content_list is None:
        # Say so explicitly rather than reporting a "completed" upload of 0.
        print(f"⏭️ 略過不支援的檔案格式: {filename}")
        return

    # --- Upload ---
    success_count = 0
    for chunk in content_list:
        if not chunk.strip():
            continue

        vector = get_embedding(chunk)
        if not vector:
            continue

        data = {
            "tribe": tribe,
            "category": category,
            "content": chunk,
            "embedding": vector,
            "metadata": {"source": filename},
        }
        try:
            supabase.table("lang_knowledge").insert(data).execute()
            success_count += 1
        except Exception as e:
            print(f"⚠️ 資料庫寫入失敗: {e}")

    print(f"✅ {filename} 上傳完成,共 {success_count} 個區塊。")


def start_ingestion(data_folder="data"):
    """Scan *data_folder* and ingest every non-hidden entry in one go.

    Args:
        data_folder: Directory holding the knowledge files. If it does not
            exist, a message is printed and nothing is ingested.
    """
    if not os.path.exists(data_folder):
        print(f"❌ 找不到資料夾: {data_folder}")
        return

    # Sorted so that runs process files in a repeatable order.
    files = sorted(f for f in os.listdir(data_folder) if not f.startswith("."))
    print(f"📦 準備導入 {len(files)} 個檔案...")

    for file in files:
        process_file(os.path.join(data_folder, file))

    print("\n🎉 所有知識庫資料導入完畢!")


if __name__ == "__main__":
    # Running this single line is all that is needed to start an import.
    start_ingestion("data")