Alexend committed on
Commit
9d3d109
·
verified ·
1 Parent(s): a20b0e3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +44 -52
app.py CHANGED
@@ -1,29 +1,33 @@
1
- # ✅ app.py - Qwen + QA + web_data + 指定網站爬蟲 txt 優先設定
2
 
3
  import json
4
  import gradio as gr
5
- from transformers import AutoTokenizer, AutoModelForCausalLM
6
  import torch
7
- import requests
8
- from bs4 import BeautifulSoup
9
 
10
- # ✅ 使用 Qwen 小模型
11
- model_name = "Qwen/Qwen1.5-0.5B-Chat"
12
- tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
13
- model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True).to("cuda" if torch.cuda.is_available() else "cpu")
14
- model.eval()
 
15
 
16
- # ✅ 載入 qa.json
17
- with open("qa.json", "r", encoding="utf-8") as f:
18
  qa_data = json.load(f)
19
 
20
- # ✅ 載入 web_data.txt
21
- with open("web_data.txt", "r", encoding="utf-8") as f:
22
- web_data = f.read()
23
 
24
- # ✅ 載入指定網站清單(txt 檔,每行一個)
25
- with open("trusted_sites.txt", "r", encoding="utf-8") as f:
26
- trusted_sites = [line.strip() for line in f if line.strip() and not line.startswith("#")]
 
 
 
27
 
28
  # ✅ QA 關鍵字比對
29
 
@@ -37,39 +41,27 @@ def retrieve_qa_context(user_input):
37
  return item["response"]
38
  return None
39
 
40
# ✅ Web scraper, trusted sites first (legacy fallback path)
def search_web(query):
    """Search each trusted site via Google and return the first page's text.

    Iterates `trusted_sites` in order, issues a site-restricted Google
    search, and fetches the first matching result link. Returns up to the
    first 1000 characters of the page text, or a fixed Chinese fallback
    message when no site yields usable content.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    for site in trusted_sites:
        search_url = f"https://www.google.com/search?q=site:{site}+{query}"
        try:
            # Bug fix: the original request had no timeout, so one slow
            # site could hang the whole answer path indefinitely.
            res = requests.get(search_url, headers=headers, timeout=5)
        except requests.RequestException:
            continue  # best-effort: move on to the next trusted site
        soup = BeautifulSoup(res.text, 'html.parser')
        for a in soup.find_all("a", href=True):
            href = a["href"]
            # Google wraps result targets as ".../url?q=<real-url>&...".
            if "url?q=" in href and site in href:
                url = href.split("url?q=")[-1].split("&")[0]
                try:
                    page = requests.get(url, headers=headers, timeout=5)
                    page.encoding = page.apparent_encoding
                    text = BeautifulSoup(page.text, "html.parser").get_text(separator=" ", strip=True)
                    return text[:1000]
                except requests.RequestException:
                    # Bug fix: was a bare `except:` which also swallowed
                    # KeyboardInterrupt/SystemExit; keep best-effort skip
                    # only for network/HTTP failures.
                    continue
    return "目前無法從指定網站取得有效資料。"
59
-
60
- # ✅ 回答生成
61
-
62
- def generate_answer_from_context(user_input, context):
63
  prompt = f"""
64
- 你是一位了解南臺科技大學的語音助理,請根據以下資料回答問題:
65
 
66
- [資料]
67
  {context}
68
 
69
  [問題]
70
  {user_input}
71
 
72
- 繁體中文簡短自然回答,不超過 90 字,回應明確有資訊,避免廢話與假設
73
  """
74
  inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
75
  outputs = model.generate(**inputs, max_new_tokens=150)
@@ -79,23 +71,23 @@ def generate_answer_from_context(user_input, context):
79
  return line.strip()
80
  return response[-90:]
81
 
82
- # ✅ 主流程
83
 
84
  def answer(user_input):
85
- context = retrieve_qa_context(user_input)
86
- if context:
87
- return context
88
  else:
89
- fallback_context = search_web(user_input)
90
- return generate_answer_from_context(user_input, fallback_context)
91
 
92
  # ✅ Gradio UI
93
  interface = gr.Interface(
94
  fn=answer,
95
- inputs=gr.Textbox(lines=2, placeholder="請問有關南臺科技大學的問題..."),
96
  outputs="text",
97
- title="南臺科技大學 問答機器人(Qwen + QA + 網路爬蟲)",
98
- description="優先使用 QA 資料本地知識若無結果,從指定網站清單中搜尋資料回答。",
99
  theme="default"
100
  )
101
 
 
1
# ✅ app.py — vector RAG entry point (QA.json + FAISS vector store + Qwen generation)

import json
import gradio as gr
import faiss
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer

# ✅ File and model configuration
QA_FILE = "qa.json"
DOCS_FILE = "docs.json"
VECTOR_FILE = "faiss_index.faiss"
EMBED_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
GEN_MODEL = "Qwen/Qwen1.5-0.5B-Chat"

# ✅ Load the keyword QA pairs
with open(QA_FILE, "r", encoding="utf-8") as f:
    qa_data = json.load(f)

# ✅ Load the document passages (rows aligned with the FAISS index ids)
with open(DOCS_FILE, "r", encoding="utf-8") as f:
    docs = json.load(f)

# ✅ Load the vector index, the sentence embedder, and the generation model
index = faiss.read_index(VECTOR_FILE)
embedder = SentenceTransformer(EMBED_MODEL)
tokenizer = AutoTokenizer.from_pretrained(GEN_MODEL, trust_remote_code=True)
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForCausalLM.from_pretrained(GEN_MODEL, trust_remote_code=True).to(device)
model.eval()
31
 
32
  # ✅ QA 關鍵字比對
33
 
 
41
  return item["response"]
42
  return None
43
 
44
+ # ✅ FAISS 向量檢索返回 top-k 最相關段落
45
+
46
def search_context_faiss(user_input, top_k=3):
    """Retrieve the top-k most relevant document passages via FAISS.

    Args:
        user_input: The user's question, free-form text.
        top_k: Number of nearest passages to retrieve (default 3).

    Returns:
        The retrieved passages joined with newlines; empty string when
        no valid passage is found.
    """
    # SentenceTransformer.encode expects a batch; FAISS wants 2-D input.
    query_vec = embedder.encode([user_input])
    # D = distances, I = row ids into `docs`; FAISS pads with -1 when the
    # index holds fewer than top_k vectors.
    D, I = index.search(query_vec, top_k)
    # Bug fix: the original guard `i < len(docs)` accepted the -1 padding
    # ids, so docs[-1] (the last passage) was silently returned for
    # missing results. Reject negative ids as well as out-of-range ones.
    retrieved = [docs[i] for i in I[0] if 0 <= i < len(docs)]
    return "\n".join(retrieved)
51
+
52
+ # 用 Qwen 生成自然語言回答
53
+
54
+ def generate_answer(user_input, context):
 
 
 
 
 
 
 
 
 
 
 
 
55
  prompt = f"""
56
+ 你是一位了解南臺科技大學的親切語音助理,請根據以下資料回答使用者的問題:
57
 
58
+ [相關資料]
59
  {context}
60
 
61
  [問題]
62
  {user_input}
63
 
64
+ 繁體中文簡短自然回答,限制在 90 字,回應明確有資訊,不要含糊或重複問題
65
  """
66
  inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
67
  outputs = model.generate(**inputs, max_new_tokens=150)
 
71
  return line.strip()
72
  return response[-90:]
73
 
74
+ # ✅ 整合邏輯
75
 
76
def answer(user_input):
    """Answer a question: exact QA keyword match first, RAG fallback second."""
    # A direct hit in qa.json wins outright — no generation needed.
    hit = retrieve_qa_context(user_input)
    if hit:
        return hit
    # Otherwise retrieve supporting passages and let Qwen compose a reply.
    passages = search_context_faiss(user_input)
    return generate_answer(user_input, passages)
83
 
84
# ✅ Gradio UI — single-textbox Q&A front end wired to answer()
interface = gr.Interface(
    fn=answer,
    inputs=gr.Textbox(lines=2, placeholder="請輸入與南臺科技大學相關的問題..."),
    outputs="text",
    title="南臺科技大學 問答機器人(向量式 RAG)",
    # Bug fix: the description string had dropped characters
    # ("...關鍵字語意向量檢索提供然繁體回答"); reconstructed the
    # intended sentence.
    description="結合 QA 關鍵字與語意向量檢索,提供自然繁體回答。",
    theme="default",
)
93