Spaces:
Sleeping
Sleeping
File size: 4,628 Bytes
772ae76 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 |
# -------------------------------
# 1. 匯入套件
# -------------------------------
import os, glob, time
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatHuggingFaceHub
from langchain.chains import RetrievalQA
from docx import Document as DocxDocument
import gradio as gr
# -------------------------------
# 2. Paths
# -------------------------------
txt_folder = "out_texts"  # folder containing the source .txt files
db_path = "faiss_db"
os.makedirs(db_path, exist_ok=True)

# -------------------------------
# 3. Embeddings
# -------------------------------
embeddings_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# -------------------------------
# 4. Load the vector store if present, otherwise build it from the text files
# -------------------------------
index_file = os.path.join(db_path, "index.faiss")
if os.path.exists(index_file):
    print("載入現有向量資料庫...")
    db = FAISS.load_local(db_path, embeddings_model, allow_dangerous_deserialization=True)
else:
    print("沒有資料庫,開始建立新向量資料庫...")
    docs = []
    for path in glob.glob(f"{txt_folder}/*.txt"):
        with open(path, "r", encoding="utf-8") as fh:
            docs.append(
                Document(
                    page_content=fh.read(),
                    metadata={"source": os.path.basename(path)},
                )
            )
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = splitter.split_documents(docs)
    print("產生向量嵌入中...")
    db = FAISS.from_documents(chunks, embeddings_model)
    db.save_local(db_path)
    print("向量資料庫建立完成。")
# -------------------------------
# 5. Hugging Face model settings
# -------------------------------
# Inference API token; recommended to configure via Spaces Secrets.
# os.getenv returns None when HF_TOKEN is unset.
HUGGINGFACE_API_TOKEN = os.getenv("HF_TOKEN") # set in Spaces Secrets
# Selectable model repo ids mapped to their max_new_tokens budget.
MODEL_DICT = {
"google/flan-t5-large": 512,
"tiiuae/falcon-7b-instruct": 512
}
# Simple in-process hourly rate limit: request_count resets once an hour
# has elapsed since last_reset_time (see rag_generate_hfapi).
MAX_HOURLY_REQUESTS = 50
request_count = 0
last_reset_time = time.time()
# -------------------------------
# 6. RAG main function
# -------------------------------
def rag_generate_hfapi(query, model_name, segments=5, max_words=1500):
    """Generate an article about *query* via RAG and save it as a DOCX file.

    Parameters:
        query (str): Article topic; used as the retrieval query and DOCX heading.
        model_name (str): Hugging Face repo id; must be a key of MODEL_DICT.
        segments (int): Number of paragraphs to request (must be >= 1).
        max_words (int): Requested upper bound on the article's total word count.

    Returns:
        tuple[str, str | None]: (generated text or an error message,
        path of the saved DOCX file, or None on failure/rate limit).
    """
    global request_count, last_reset_time

    # Validate up front: segments == 0 would raise ZeroDivisionError in the
    # prompt below, and an unknown model_name would raise KeyError — both
    # previously escaped the try block and crashed the Gradio handler.
    if segments < 1:
        return "(生成失敗:段落數必須至少為 1)", None
    if model_name not in MODEL_DICT:
        return f"(生成失敗:不支援的模型 {model_name})", None

    # Reset the hourly rate-limit window once an hour has elapsed.
    now = time.time()
    if now - last_reset_time > 3600:
        request_count = 0
        last_reset_time = now
    if request_count >= MAX_HOURLY_REQUESTS:
        return f"本小時生成次數已達上限 ({MAX_HOURLY_REQUESTS}),請稍後再試。", None

    prompt = f"""請依據下列主題生成一篇文章:
主題:{query}
需求:
- 總共 {segments} 段
- 每段約 {max_words // segments} 字
- 總字數請控制在 {max_words} 字以內
- 請自動分段輸出
"""

    try:
        # NOTE(review): ChatHuggingFaceHub is imported from langchain.chat_models
        # at the top of this file — verify it exists in the pinned langchain
        # version (newer releases expose ChatHuggingFace in langchain_community).
        # Construction is inside the try so API/auth errors become a message
        # instead of an unhandled exception.
        llm = ChatHuggingFaceHub(
            repo_id=model_name,
            model_kwargs={"temperature": 0.7, "max_new_tokens": MODEL_DICT[model_name]},
            huggingfacehub_api_token=HUGGINGFACE_API_TOKEN
        )
        qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            retriever=db.as_retriever(search_type="similarity", search_kwargs={"k": 5}),
            return_source_documents=True
        )
        result = qa_chain({"query": prompt})
        full_text = result["result"].strip()
        if not full_text:
            full_text = "(生成失敗,請改用其他模型或調整段落數)"
    except Exception as e:
        return f"(生成失敗:{str(e)})", None

    # Count only successful generations against the hourly quota.
    request_count += 1

    paragraphs = [p.strip() for p in full_text.split("\n") if p.strip()]

    # Save a DOCX copy for download. NOTE(review): fixed filename means
    # concurrent requests overwrite each other's file — acceptable for a
    # single-user demo, revisit if multi-user.
    docx_file = "generated_article.docx"
    doc = DocxDocument()
    doc.add_heading(query, level=1)
    for p in paragraphs:
        doc.add_paragraph(p)
    doc.save(docx_file)

    return "\n\n".join(paragraphs), docx_file
# -------------------------------
# 7. Gradio UI
# -------------------------------
# Input widgets map positionally onto rag_generate_hfapi's parameters.
topic_input = gr.Textbox(lines=2, placeholder="請輸入文章主題")
model_input = gr.Dropdown(
    list(MODEL_DICT.keys()), value="google/flan-t5-large", label="選擇模型"
)
segments_input = gr.Slider(minimum=1, maximum=10, value=5, step=1, label="段落數")
words_input = gr.Slider(minimum=500, maximum=3000, value=1500, step=100, label="文章字數上限")

iface = gr.Interface(
    fn=rag_generate_hfapi,
    inputs=[topic_input, model_input, segments_input, words_input],
    outputs=[gr.Textbox(label="生成文章"), gr.File(label="下載 DOCX")],
    title="佛教經論 RAG 系統 (Hugging Face API)",
    description="使用 Hugging Face API 生成文章,可選大模型,分段生成並下載 DOCX,每小時生成次數有限制"
)
iface.launch()
|