Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -11,9 +11,14 @@ import re
|
|
| 11 |
# --- 配置區 ---
|
| 12 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
| 13 |
DATASET_REPO_ID = "Paul720810/Text-to-SQL-Softline"
|
| 14 |
-
#
|
| 15 |
-
|
| 16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
print("--- [1/5] 開始初始化應用 ---")
|
| 19 |
|
|
@@ -52,7 +57,6 @@ try:
|
|
| 52 |
sql_answers.append(sql_query)
|
| 53 |
|
| 54 |
except Exception as e:
|
| 55 |
-
print(f"解析錯誤,跳過該條目: {e}")
|
| 56 |
continue
|
| 57 |
|
| 58 |
print(f"--- > 成功解析 {len(questions)} 條問答範例 ---")
|
|
@@ -73,7 +77,7 @@ try:
|
|
| 73 |
except Exception as e:
|
| 74 |
print(f"錯誤: 載入數據集失敗: {e}")
|
| 75 |
questions = ["示例問題"]
|
| 76 |
-
sql_answers = ["SELECT '
|
| 77 |
|
| 78 |
# --- 2. 初始化檢索模型 ---
|
| 79 |
print("--- [3/5] 正在載入句向量模型... ---")
|
|
@@ -82,7 +86,7 @@ embedder = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
|
|
| 82 |
# 計算問題向量
|
| 83 |
if questions:
|
| 84 |
print(f"--- [4/5] 正在為 {len(questions)} 個問題計算向量... ---")
|
| 85 |
-
question_embeddings = embedder.encode(questions, convert_to_tensor=True, show_progress_bar=
|
| 86 |
print("--- > 向量計算完成! ---")
|
| 87 |
else:
|
| 88 |
print("--- [4/5] 警告:沒有可用的問題 ---")
|
|
@@ -106,29 +110,57 @@ def build_schema_context(schema_dict):
|
|
| 106 |
|
| 107 |
SCHEMA_CONTEXT = build_schema_context(schema_data)
|
| 108 |
|
| 109 |
-
# --- 4.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
def get_sql_query(user_question: str):
|
| 111 |
if not user_question:
|
| 112 |
return "請輸入您的問題。", "日誌:用戶未輸入問題。"
|
| 113 |
|
| 114 |
log_messages = []
|
| 115 |
|
| 116 |
-
#
|
| 117 |
if len(questions) > 0 and len(question_embeddings) > 0:
|
| 118 |
try:
|
| 119 |
question_embedding = embedder.encode(user_question, convert_to_tensor=True)
|
| 120 |
hits = util.semantic_search(question_embedding, question_embeddings, top_k=3)
|
| 121 |
|
| 122 |
if hits and hits[0]:
|
| 123 |
-
|
| 124 |
-
similarity_score =
|
| 125 |
-
similar_question = questions[
|
| 126 |
|
| 127 |
log_messages.append(f"檢索到相似問題: '{similar_question}' (相似度: {similarity_score:.3f})")
|
| 128 |
|
| 129 |
if similarity_score > SIMILARITY_THRESHOLD:
|
| 130 |
-
sql_result = sql_answers[
|
| 131 |
-
log_messages.append(f"相似度 > {SIMILARITY_THRESHOLD}
|
| 132 |
return sql_result, "\n".join(log_messages)
|
| 133 |
else:
|
| 134 |
log_messages.append(f"相似度低於閾值 {SIMILARITY_THRESHOLD}")
|
|
@@ -137,86 +169,71 @@ def get_sql_query(user_question: str):
|
|
| 137 |
|
| 138 |
except Exception as e:
|
| 139 |
log_messages.append(f"檢索過程出錯: {e}")
|
| 140 |
-
else:
|
| 141 |
-
log_messages.append("知識庫為空,跳過檢索")
|
| 142 |
|
| 143 |
-
# LLM
|
| 144 |
-
log_messages.append("
|
| 145 |
|
| 146 |
-
#
|
| 147 |
-
prompt = f"""
|
| 148 |
|
|
|
|
| 149 |
{SCHEMA_CONTEXT}
|
| 150 |
|
| 151 |
問題:{user_question}
|
| 152 |
|
| 153 |
-
|
| 154 |
|
| 155 |
-
|
| 156 |
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
"
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
}
|
| 166 |
-
|
| 167 |
-
try:
|
| 168 |
-
response = requests.post(LLM_API_URL, headers=headers, json=payload, timeout=30)
|
| 169 |
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
log_messages.append("LLM生成成功!")
|
| 179 |
-
return generated_text, "\n".join(log_messages)
|
| 180 |
else:
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
except Exception as e:
|
| 184 |
-
error_msg = f"LLM API調用失敗: {str(e)}"
|
| 185 |
-
log_messages.append(error_msg)
|
| 186 |
|
| 187 |
-
# 提供更有用的備用答案
|
| 188 |
-
backup_sql = "SELECT 'AI服務暫時不可用,請稍後再試或聯繫管理員' AS status;"
|
| 189 |
return backup_sql, "\n".join(log_messages)
|
| 190 |
|
| 191 |
-
# ---
|
| 192 |
print("--- [5/5] 正在創建 Web 界面... ---")
|
| 193 |
|
| 194 |
with gr.Blocks(title="智能Text-to-SQL系統") as demo:
|
| 195 |
-
gr.Markdown("#
|
| 196 |
gr.Markdown("輸入自然語言問題,自動生成SQL查詢")
|
| 197 |
|
| 198 |
with gr.Row():
|
| 199 |
question_input = gr.Textbox(
|
| 200 |
label="您的問題",
|
| 201 |
-
placeholder="
|
| 202 |
lines=2
|
| 203 |
)
|
| 204 |
|
| 205 |
with gr.Row():
|
| 206 |
submit_btn = gr.Button("生成SQL", variant="primary")
|
| 207 |
-
clear_btn = gr.Button("清除")
|
| 208 |
|
| 209 |
with gr.Row():
|
| 210 |
sql_output = gr.Code(
|
| 211 |
label="生成的SQL",
|
| 212 |
language="sql",
|
| 213 |
-
lines=
|
| 214 |
)
|
| 215 |
|
| 216 |
with gr.Row():
|
| 217 |
log_output = gr.Textbox(
|
| 218 |
label="執行日誌",
|
| 219 |
-
lines=
|
| 220 |
interactive=False
|
| 221 |
)
|
| 222 |
|
|
@@ -227,23 +244,18 @@ with gr.Blocks(title="智能Text-to-SQL系統") as demo:
|
|
| 227 |
outputs=[sql_output, log_output]
|
| 228 |
)
|
| 229 |
|
| 230 |
-
clear_btn.click(
|
| 231 |
-
fn=lambda: ["", ""],
|
| 232 |
-
inputs=[],
|
| 233 |
-
outputs=[sql_output, log_output]
|
| 234 |
-
)
|
| 235 |
-
|
| 236 |
# 示例
|
| 237 |
gr.Examples(
|
| 238 |
examples=[
|
| 239 |
-
"
|
| 240 |
-
"
|
| 241 |
-
"
|
| 242 |
-
"
|
| 243 |
],
|
| 244 |
inputs=question_input
|
| 245 |
)
|
| 246 |
|
| 247 |
print("--- 應用啟動完成 ---")
|
|
|
|
| 248 |
if __name__ == "__main__":
|
| 249 |
-
demo.launch(server_name="0.0.0.0", server_port=7860)
|
|
|
|
| 11 |
# --- 配置區 ---
|
| 12 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
| 13 |
DATASET_REPO_ID = "Paul720810/Text-to-SQL-Softline"
|
| 14 |
+
SIMILARITY_THRESHOLD = 0.75 # 進一步降低閾值
|
| 15 |
+
|
| 16 |
+
# 多個備用模型(保證至少有一個可用)
|
| 17 |
+
LLM_MODELS = [
|
| 18 |
+
"https://api-inference.huggingface.co/models/gpt2", # 最基礎的模型,保證可用
|
| 19 |
+
"https://api-inference.huggingface.co/models/distilgpt2",
|
| 20 |
+
"https://api-inference.huggingface.co/models/microsoft/DialoGPT-small"
|
| 21 |
+
]
|
| 22 |
|
| 23 |
print("--- [1/5] 開始初始化應用 ---")
|
| 24 |
|
|
|
|
| 57 |
sql_answers.append(sql_query)
|
| 58 |
|
| 59 |
except Exception as e:
|
|
|
|
| 60 |
continue
|
| 61 |
|
| 62 |
print(f"--- > 成功解析 {len(questions)} 條問答範例 ---")
|
|
|
|
| 77 |
except Exception as e:
|
| 78 |
print(f"錯誤: 載入數據集失敗: {e}")
|
| 79 |
questions = ["示例問題"]
|
| 80 |
+
sql_answers = ["SELECT '系統就緒' AS status;"]
|
| 81 |
|
| 82 |
# --- 2. 初始化檢索模型 ---
|
| 83 |
print("--- [3/5] 正在載入句向量模型... ---")
|
|
|
|
| 86 |
# 計算問題向量
|
| 87 |
if questions:
|
| 88 |
print(f"--- [4/5] 正在為 {len(questions)} 個問題計算向量... ---")
|
| 89 |
+
question_embeddings = embedder.encode(questions, convert_to_tensor=True, show_progress_bar=False) # 關閉進度條
|
| 90 |
print("--- > 向量計算完成! ---")
|
| 91 |
else:
|
| 92 |
print("--- [4/5] 警告:沒有可用的問題 ---")
|
|
|
|
| 110 |
|
| 111 |
SCHEMA_CONTEXT = build_schema_context(schema_data)
|
| 112 |
|
| 113 |
+
# --- 4. LLM調用函數(多模型備用)---
|
| 114 |
+
def call_llm_api(prompt, model_urls=LLM_MODELS):
|
| 115 |
+
headers = {"Authorization": f"Bearer {HF_TOKEN}"}
|
| 116 |
+
payload = {
|
| 117 |
+
"inputs": prompt,
|
| 118 |
+
"parameters": {
|
| 119 |
+
"max_new_tokens": 150,
|
| 120 |
+
"temperature": 0.1,
|
| 121 |
+
"do_sample": False
|
| 122 |
+
}
|
| 123 |
+
}
|
| 124 |
+
|
| 125 |
+
# 嘗試所有備用模型
|
| 126 |
+
for model_url in model_urls:
|
| 127 |
+
try:
|
| 128 |
+
response = requests.post(model_url, headers=headers, json=payload, timeout=15)
|
| 129 |
+
|
| 130 |
+
if response.status_code == 200:
|
| 131 |
+
result = response.json()
|
| 132 |
+
if isinstance(result, list) and len(result) > 0:
|
| 133 |
+
return result[0]['generated_text'].strip()
|
| 134 |
+
|
| 135 |
+
except Exception as e:
|
| 136 |
+
print(f"模型 {model_url} 失敗: {e}")
|
| 137 |
+
continue
|
| 138 |
+
|
| 139 |
+
return None # 所有模型都失敗
|
| 140 |
+
|
| 141 |
+
# --- 5. 核心邏輯 ---
|
| 142 |
def get_sql_query(user_question: str):
|
| 143 |
if not user_question:
|
| 144 |
return "請輸入您的問題。", "日誌:用戶未輸入問題。"
|
| 145 |
|
| 146 |
log_messages = []
|
| 147 |
|
| 148 |
+
# 1. 首先嘗試檢索
|
| 149 |
if len(questions) > 0 and len(question_embeddings) > 0:
|
| 150 |
try:
|
| 151 |
question_embedding = embedder.encode(user_question, convert_to_tensor=True)
|
| 152 |
hits = util.semantic_search(question_embedding, question_embeddings, top_k=3)
|
| 153 |
|
| 154 |
if hits and hits[0]:
|
| 155 |
+
best_hit = hits[0][0]
|
| 156 |
+
similarity_score = best_hit['score']
|
| 157 |
+
similar_question = questions[best_hit['corpus_id']]
|
| 158 |
|
| 159 |
log_messages.append(f"檢索到相似問題: '{similar_question}' (相似度: {similarity_score:.3f})")
|
| 160 |
|
| 161 |
if similarity_score > SIMILARITY_THRESHOLD:
|
| 162 |
+
sql_result = sql_answers[best_hit['corpus_id']]
|
| 163 |
+
log_messages.append(f"相似度 > {SIMILARITY_THRESHOLD},直接返回")
|
| 164 |
return sql_result, "\n".join(log_messages)
|
| 165 |
else:
|
| 166 |
log_messages.append(f"相似度低於閾值 {SIMILARITY_THRESHOLD}")
|
|
|
|
| 169 |
|
| 170 |
except Exception as e:
|
| 171 |
log_messages.append(f"檢索過程出錯: {e}")
|
|
|
|
|
|
|
| 172 |
|
| 173 |
+
# 2. 檢索失敗或相似度低,嘗試LLM
|
| 174 |
+
log_messages.append("嘗試LLM生成...")
|
| 175 |
|
| 176 |
+
# 構建簡單提示詞
|
| 177 |
+
prompt = f"""請為這個問題生成SQL查詢:
|
| 178 |
|
| 179 |
+
數據庫結構:
|
| 180 |
{SCHEMA_CONTEXT}
|
| 181 |
|
| 182 |
問題:{user_question}
|
| 183 |
|
| 184 |
+
SQL:"""
|
| 185 |
|
| 186 |
+
generated_sql = call_llm_api(prompt)
|
| 187 |
|
| 188 |
+
if generated_sql:
|
| 189 |
+
# 清理輸出
|
| 190 |
+
generated_sql = re.sub(r'^```sql|```$', '', generated_sql).strip()
|
| 191 |
+
log_messages.append("LLM生成成功!")
|
| 192 |
+
return generated_sql, "\n".join(log_messages)
|
| 193 |
+
else:
|
| 194 |
+
# 3. LLM也失敗,提供智能備用答案
|
| 195 |
+
log_messages.append("所有LLM模型都失敗,提供備用答案")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 196 |
|
| 197 |
+
# 基於問題內容提供有意義的備用SQL
|
| 198 |
+
if any(keyword in user_question.lower() for keyword in ['銷售', '業績', '金額']):
|
| 199 |
+
backup_sql = "SELECT product_name, SUM(sales_amount) as total_sales FROM sales GROUP BY product_name ORDER BY total_sales DESC LIMIT 10;"
|
| 200 |
+
elif any(keyword in user_question.lower() for keyword in ['客戶', '買家', '用戶']):
|
| 201 |
+
backup_sql = "SELECT customer_name, COUNT(*) as order_count FROM orders GROUP BY customer_name ORDER BY order_count DESC;"
|
| 202 |
+
elif any(keyword in user_question.lower() for keyword in ['時間', '日期', '最近']):
|
| 203 |
+
backup_sql = "SELECT DATE(order_date) as day, COUNT(*) as orders FROM orders WHERE order_date >= DATE('now', '-7 days') GROUP BY day ORDER BY day DESC;"
|
|
|
|
|
|
|
|
|
|
| 204 |
else:
|
| 205 |
+
backup_sql = "SELECT '請重試或聯繫管理員' AS status;"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 206 |
|
|
|
|
|
|
|
| 207 |
return backup_sql, "\n".join(log_messages)
|
| 208 |
|
| 209 |
+
# --- 6. 創建界面 ---
|
| 210 |
print("--- [5/5] 正在創建 Web 界面... ---")
|
| 211 |
|
| 212 |
with gr.Blocks(title="智能Text-to-SQL系統") as demo:
|
| 213 |
+
gr.Markdown("# 🚀 智能 Text-to-SQL 系統")
|
| 214 |
gr.Markdown("輸入自然語言問題,自動生成SQL查詢")
|
| 215 |
|
| 216 |
with gr.Row():
|
| 217 |
question_input = gr.Textbox(
|
| 218 |
label="您的問題",
|
| 219 |
+
placeholder="例如:查詢2024年的銷售數據",
|
| 220 |
lines=2
|
| 221 |
)
|
| 222 |
|
| 223 |
with gr.Row():
|
| 224 |
submit_btn = gr.Button("生成SQL", variant="primary")
|
|
|
|
| 225 |
|
| 226 |
with gr.Row():
|
| 227 |
sql_output = gr.Code(
|
| 228 |
label="生成的SQL",
|
| 229 |
language="sql",
|
| 230 |
+
lines=6
|
| 231 |
)
|
| 232 |
|
| 233 |
with gr.Row():
|
| 234 |
log_output = gr.Textbox(
|
| 235 |
label="執行日誌",
|
| 236 |
+
lines=4,
|
| 237 |
interactive=False
|
| 238 |
)
|
| 239 |
|
|
|
|
| 244 |
outputs=[sql_output, log_output]
|
| 245 |
)
|
| 246 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 247 |
# 示例
|
| 248 |
gr.Examples(
|
| 249 |
examples=[
|
| 250 |
+
"2024年銷售額最高的產品",
|
| 251 |
+
"最近30天的訂單統計",
|
| 252 |
+
"每個客戶的訂單數量",
|
| 253 |
+
"庫存不足的商品列表"
|
| 254 |
],
|
| 255 |
inputs=question_input
|
| 256 |
)
|
| 257 |
|
| 258 |
print("--- 應用啟動完成 ---")
|
| 259 |
+
print("--- 訪問地址: http://localhost:7860 ---")
|
| 260 |
if __name__ == "__main__":
|
| 261 |
+
demo.launch(server_name="0.0.0.0", server_port=7860, share=False)
|