Spaces:

ILRDF-Lowking
/

ILRDF-AI-Translator

Running

App Files Files Community

ILRDF-AI-Translator / knowledge_uploader.py

ILRDF-Lowking

Update knowledge_uploader.py

b1f49fc verified about 2 months ago

Raw

History Blame Contribute Delete

4.19 kB

	# knowledge_uploader.py
	import os
	import re
	import pandas as pd
	from pypdf import PdfReader
	from odf import text, teletype
	from odf.opendocument import load
	from langchain_text_splitters import RecursiveCharacterTextSplitter
	import google.generativeai as genai
	from supabase import create_client
	import config

	# 1. Initialise connections: configure the Gemini SDK with the API key from
	#    `config`, and create the Supabase client used for all inserts below.
	genai.configure(api_key=config.GEMINI_KEY)
	supabase = create_client(config.SUPABASE_URL, config.SUPABASE_KEY)

	# 2. Configure the text splitter (tuned for grammar and cultural corpus text).
	#    The separator list includes blank lines, newlines, the CJK full-width
	#    sentence enders "。", "！", "？", a space, and finally the empty string.
	text_splitter = RecursiveCharacterTextSplitter(
	chunk_size=600,
	chunk_overlap=100,
	separators=["\n\n", "\n", "。", "！", "？", " ", ""]
	)

	def get_embedding(text):
	    """Embed ``text`` with Google's ``models/text-embedding-004`` model.

	    The request is sent with ``task_type="retrieval_document"``. The original
	    note on this function states that the vector has 768 dimensions.

	    Returns:
	        The value stored under the ``'embedding'`` key of the API result, or
	        ``None`` (after printing the error) if anything in the call raises.
	    """
	    request = {
	        "model": "models/text-embedding-004",
	        "content": text,
	        "task_type": "retrieval_document",
	    }
	    try:
	        response = genai.embed_content(**request)
	        vector = response['embedding']
	    except Exception as err:
	        # Best-effort: report the failure and let the caller skip this chunk.
	        print(f"❌ 向量化失敗: {err}")
	        vector = None
	    return vector

	# Numeric prefix inside the brackets -> category stored with each chunk.
	_CATEGORY_BY_CODE = {
	    "01": "Dict",  # vocabulary / dictionary
	    "02": "Gram",  # grammar rules
	    "03": "Cult",  # cultural corpus
	    "04": "Corp",  # general corpus
	}

	# [<code>_<label>]_<LANG>_...  The language tag excludes "_" so that it stops
	# at the first underscore instead of greedily absorbing later name parts.
	_FILENAME_PATTERN = re.compile(r"\[(\d+)_(\w+)\]_([^\W_]+)_")


	def parse_file_info(filename):
	    """Derive the language tag and category from a corpus file name.

	    Expected format: ``[01_Dict]_TRK_太魯閣語辭典.odt`` -- a bracketed
	    ``<code>_<label>`` pair, then the language code (TRK, SED, ...), then the
	    free-form title.

	    Args:
	        filename: Base name of the file (no directory part).

	    Returns:
	        ``(lang_tag, category)``. ``category`` is looked up from the numeric
	        code ("01" -> "Dict", "02" -> "Gram", "03" -> "Cult", "04" -> "Corp")
	        and is ``"Other"`` for an unknown code. ``("Unknown", "Other")`` is
	        returned when the name does not match the format or is not a string.
	    """
	    fallback = ("Unknown", "Other")

	    # Non-string input used to be swallowed by a bare ``except``; keep the
	    # same result but make the condition explicit.
	    if not isinstance(filename, str):
	        return fallback

	    match = _FILENAME_PATTERN.match(filename)
	    if match is None:
	        return fallback

	    code = match.group(1)      # "01", "02", ...
	    # Group 2 is the bracket label ("Dict"); the language code is group 3.
	    lang_tag = match.group(3)  # "TRK", "SED", ...
	    return lang_tag, _CATEGORY_BY_CODE.get(code, "Other")

	def _read_chunks(file_path):
	    """Return the list of text chunks for one file, chosen by its extension.

	    ``.pdf`` and ``.odt`` files are flattened to text and split with the
	    module-level ``text_splitter``; each ``.csv`` row becomes one chunk.
	    Any other extension yields an empty list.
	    """
	    lowered = file_path.lower()

	    # --- PDF ---
	    if lowered.endswith(".pdf"):
	        reader = PdfReader(file_path)
	        # ``or ""`` is defensive: a page with no extractable text must not
	        # abort the whole file. join() avoids repeated string concatenation.
	        pages = [(page.extract_text() or "") + "\n" for page in reader.pages]
	        return text_splitter.split_text("".join(pages))

	    # --- ODT ---
	    if lowered.endswith(".odt"):
	        odt_doc = load(file_path)
	        paragraphs = odt_doc.getElementsByType(text.P)
	        full_text = "\n".join(teletype.extractText(p) for p in paragraphs)
	        # NOTE (from the original author): dictionary-type files could be
	        # split into finer chunks; the shared splitter is used for now.
	        return text_splitter.split_text(full_text)

	    # --- CSV (fallback format) ---
	    if lowered.endswith(".csv"):
	        df = pd.read_csv(file_path)
	        # Plain " | " separator. The previous "\|" was an invalid escape
	        # sequence that put a literal backslash into every stored row.
	        return df.apply(lambda row: " | ".join(row.astype(str)), axis=1).tolist()

	    return []


	def process_file(file_path):
	    """Read one file, embed each chunk and insert it into ``lang_knowledge``.

	    The language tag and category come from ``parse_file_info`` applied to the
	    base name. Blank chunks and chunks whose embedding failed are skipped; a
	    failed insert is printed and does not stop the remaining chunks. Errors
	    raised while reading the file itself are not caught here.

	    Args:
	        file_path: Path to a ``.pdf``, ``.odt`` or ``.csv`` file. Other
	            extensions produce no chunks.

	    Returns:
	        None. Progress and the number of uploaded chunks are printed.
	    """
	    filename = os.path.basename(file_path)
	    tribe, category = parse_file_info(filename)

	    print(f"🚀 處理中: {filename} (族語: {tribe}, 類別: {category})")

	    # --- Upload ---
	    success_count = 0
	    for chunk in _read_chunks(file_path):
	        if not chunk.strip():
	            continue

	        vector = get_embedding(chunk)
	        if not vector:
	            continue

	        data = {
	            "tribe": tribe,
	            "category": category,
	            "content": chunk,
	            "embedding": vector,
	            "metadata": {"source": filename},
	        }
	        try:
	            supabase.table("lang_knowledge").insert(data).execute()
	        except Exception as e:
	            print(f"⚠️ 資料庫寫入失敗: {e}")
	        else:
	            success_count += 1

	    print(f"✅ {filename} 上傳完成，共 {success_count} 個區塊。")

	def start_ingestion(data_folder="data"):
	    """Scan ``data_folder`` and ingest every visible regular file in it.

	    Entries whose name starts with "." and sub-directories are skipped. Files
	    are handed to ``process_file`` one by one in sorted name order.

	    Args:
	        data_folder: Directory to scan. If it is missing or is not a
	            directory, a message is printed and nothing is processed.

	    Returns:
	        None.
	    """
	    # isdir rather than exists: a path that exists but is a regular file
	    # would otherwise make os.listdir raise NotADirectoryError.
	    if not os.path.isdir(data_folder):
	        print(f"❌ 找不到資料夾: {data_folder}")
	        return

	    # os.listdir returns entries in arbitrary order; sort for reproducible runs.
	    files = [
	        name
	        for name in sorted(os.listdir(data_folder))
	        if not name.startswith(".")
	        and os.path.isfile(os.path.join(data_folder, name))
	    ]
	    print(f"📦 準備導入 {len(files)} 個檔案...")

	    for name in files:
	        process_file(os.path.join(data_folder, name))

	    print("\n🎉 所有知識庫資料導入完畢！")

	if __name__ == "__main__":
	# Script entry point: running this file directly ingests everything under
	# the "data" folder -- this single call is all the operator needs to run.
	start_ingestion("data")