Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -589,61 +589,45 @@ class TextToSQLSystem:
|
|
| 589 |
|
| 590 |
return fixed_sql
|
| 591 |
|
| 592 |
-
# 更新 _format_relevant_schema 以提供更準確的信息
|
| 593 |
def _format_relevant_schema(self, table_names: List[str]) -> str:
|
| 594 |
-
"""
|
|
|
|
|
|
|
| 595 |
if not self.schema:
|
| 596 |
-
return "
|
| 597 |
|
| 598 |
-
# 建立一個從全小寫表名到實際大小寫表名的映射
|
| 599 |
actual_table_names_map = {name.lower(): name for name in self.schema.keys()}
|
| 600 |
-
|
| 601 |
real_table_names = []
|
| 602 |
for table in table_names:
|
| 603 |
-
# 嘗試用小寫名稱去映射,找到正確的大小寫
|
| 604 |
actual_name = actual_table_names_map.get(table.lower())
|
| 605 |
if actual_name:
|
| 606 |
real_table_names.append(actual_name)
|
| 607 |
-
# 如果映射失敗,但原始名稱剛好存在,也加入 (作為備用)
|
| 608 |
elif table in self.schema:
|
| 609 |
real_table_names.append(table)
|
| 610 |
|
| 611 |
-
# 如果根據問題分析後,沒有找到任何相關的表格,則使用預設的核心表格
|
| 612 |
if not real_table_names:
|
| 613 |
self._log("未識別到相關表格,使用預設核心表格。", "WARNING")
|
| 614 |
real_table_names = ['TSR53SampleDescription', 'JobTimeline', 'JobsInProgress']
|
| 615 |
-
# --- END: 修正 NameError 的關鍵程式碼 ---
|
| 616 |
|
| 617 |
-
formatted = "
|
| 618 |
-
formatted += "-- Use ONLY the following tables and columns for the query.\n\n"
|
| 619 |
-
|
| 620 |
-
# 現在這個迴圈可以正常執行了,因為 real_table_names 已經被定義
|
| 621 |
for table in real_table_names:
|
| 622 |
if table in self.schema:
|
| 623 |
-
#
|
| 624 |
-
formatted += f"
|
| 625 |
-
|
| 626 |
-
# 只顯示前
|
| 627 |
-
for col in self.schema[table][:
|
| 628 |
col_name = col['name']
|
| 629 |
col_type = col['type']
|
| 630 |
-
|
| 631 |
-
|
| 632 |
-
|
| 633 |
-
|
| 634 |
-
|
| 635 |
-
|
| 636 |
-
formatted += ")
|
| 637 |
|
| 638 |
-
|
| 639 |
-
formatted += """-- Important Notes:
|
| 640 |
-
-- - Customer name is in `TSR53SampleDescription.InvoiceToName`.
|
| 641 |
-
-- - Buyer name is in `JobsInProgress.BuyerName` or `TSR53SampleDescription.BuyerName`.
|
| 642 |
-
-- - To get the year from a date, use `strftime('%Y', date_column) = '2024'`.
|
| 643 |
-
-- - Report completion is determined by `JobTimeline.ReportAuthorization`.
|
| 644 |
-
-- - Report rating (Pass/Fail) is in `TSR53SampleDescription.OverallRating`.
|
| 645 |
-
"""
|
| 646 |
-
return formatted
|
| 647 |
|
| 648 |
def find_most_similar(self, question: str, top_k: int) -> List[Dict]:
|
| 649 |
"""使用 FAISS 快速檢索相似問題"""
|
|
@@ -694,58 +678,44 @@ class TextToSQLSystem:
|
|
| 694 |
self._log(f"❌ 檢索失敗: {e}", "ERROR")
|
| 695 |
return []
|
| 696 |
|
|
|
|
|
|
|
| 697 |
def _build_prompt(self, user_q: str, examples: List[Dict]) -> str:
|
| 698 |
"""
|
| 699 |
-
|
| 700 |
"""
|
| 701 |
relevant_tables = self._identify_relevant_tables(user_q)
|
| 702 |
|
| 703 |
-
#
|
| 704 |
-
system_context = "You are an expert AI assistant that generates SQLite queries based on a database schema and a user's question."
|
| 705 |
schema_str = self._format_relevant_schema(relevant_tables)
|
| 706 |
|
| 707 |
-
|
| 708 |
-
ex_str = ""
|
| 709 |
-
# 檢查 examples 列表是否為空,避免出錯
|
| 710 |
if examples:
|
| 711 |
-
|
| 712 |
-
|
| 713 |
-
|
| 714 |
-
|
| 715 |
-
|
| 716 |
-
|
| 717 |
-
|
| 718 |
-
|
| 719 |
-
|
| 720 |
-
|
|
|
|
|
|
|
|
|
|
| 721 |
|
| 722 |
-
|
| 723 |
-
|
| 724 |
-
- You **must** use SQLite syntax (e.g., use `strftime('%Y', date_column)` for years).
|
| 725 |
-
- You **must** output **nothing** else, only the SQL query inside a single ```sql code block.
|
| 726 |
|
| 727 |
-
|
| 728 |
-
|
|
|
|
|
|
|
| 729 |
```sql
|
| 730 |
"""
|
| 731 |
-
|
| 732 |
-
#
|
| 733 |
-
prompt = f"""{system_context}
|
| 734 |
-
|
| 735 |
-
{schema_str}
|
| 736 |
-
{ex_str}
|
| 737 |
-
{final_task_instruction}
|
| 738 |
-
"""
|
| 739 |
-
|
| 740 |
-
# 5. 限制總長度 (這個邏輯保持不變)
|
| 741 |
-
# 確保在截斷時,最後的指令部分是完整的
|
| 742 |
-
if len(prompt) > 1500:
|
| 743 |
-
# 找到 final_task_instruction 在 prompt 中的起始位置
|
| 744 |
-
instruction_start_index = prompt.find("Now, based on the schema")
|
| 745 |
-
# 保留 schema 和一部分範例,然後接上完整的最終指令
|
| 746 |
-
allowed_context_len = 1500 - len(final_task_instruction)
|
| 747 |
-
prompt = prompt[:allowed_context_len] + "...\n\n" + final_task_instruction
|
| 748 |
-
|
| 749 |
return prompt
|
| 750 |
|
| 751 |
|
|
|
|
| 589 |
|
| 590 |
return fixed_sql
|
| 591 |
|
|
|
|
| 592 |
def _format_relevant_schema(self, table_names: List[str], max_columns: int = 10) -> str:
    """Render the schema of the requested tables as compact prompt text.

    Every requested table that exists in ``self.schema`` becomes a
    ``Table: <name>`` line followed by a ``Columns: ...`` line. Columns are
    written as ``name (type)`` or ``name (type, description)``. The plain
    layout is deliberate: it is meant to be hard for the model to mistake
    for SQL it should imitate.

    Args:
        table_names: Candidate table names. They are matched against the
            keys of ``self.schema`` case-insensitively, and repeated names
            are emitted only once.
        max_columns: Upper bound on the number of columns listed per table
            (the first ``max_columns`` entries are kept).

    Returns:
        The formatted schema with surrounding whitespace stripped, or the
        fixed "No schema available." line when ``self.schema`` is empty.
    """
    if not self.schema:
        return "No schema available.\n"

    # Lower-cased name -> spelling actually used as the schema key. Every
    # schema key is present here, so no separate exact-match lookup is needed.
    canonical_names = {name.lower(): name for name in self.schema}

    resolved_tables = []
    already_added = set()
    for requested in table_names:
        actual_name = canonical_names.get(requested.lower())
        # Skip unknown tables and tables already selected under another
        # spelling, so the prompt never repeats a schema section.
        if actual_name is None or actual_name in already_added:
            continue
        already_added.add(actual_name)
        resolved_tables.append(actual_name)

    if not resolved_tables:
        # Nothing recognised: fall back to the default core tables.
        self._log("未識別到相關表格,使用預設核心表格。", "WARNING")
        resolved_tables = ['TSR53SampleDescription', 'JobTimeline', 'JobsInProgress']

    sections = []
    for table in resolved_tables:
        # The fallback names are hard-coded and may not exist in this schema.
        if table not in self.schema:
            continue

        rendered_columns = []
        for col in self.schema[table][:max_columns]:
            col_name = col['name']
            col_type = col['type']
            # The description may be absent or explicitly None; flatten
            # newlines so each column stays on the single "Columns:" line.
            col_desc = str(col.get('description') or '').replace('\n', ' ')
            if col_desc:
                rendered_columns.append(f"{col_name} ({col_type}, {col_desc})")
            else:
                rendered_columns.append(f"{col_name} ({col_type})")

        sections.append(f"Table: {table}\nColumns: {', '.join(rendered_columns)}")

    # Sections are separated by one blank line.
    return "\n\n".join(sections).strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 631 |
|
| 632 |
def find_most_similar(self, question: str, top_k: int) -> List[Dict]:
|
| 633 |
"""使用 FAISS 快速檢索相似問題"""
|
|
|
|
| 678 |
self._log(f"❌ 檢索失敗: {e}", "ERROR")
|
| 679 |
return []
|
| 680 |
|
| 681 |
+
# in class TextToSQLSystem:
|
| 682 |
+
|
| 683 |
def _build_prompt(self, user_q: str, examples: List[Dict]) -> str:
    """Build the structured, task-oriented prompt for one user question.

    The prompt consists of four sections introduced by ``### ... ###``
    headings: fixed instructions, the schema of the tables identified as
    relevant to the question, at most one few-shot example, and the task
    itself. It ends with an opened ```sql fence followed by a newline.

    Args:
        user_q: The user's natural-language question.
        examples: Retrieved examples; only the first one is used. It is
            expected to be a dict with non-empty ``question`` and ``sql``
            entries. Anything else falls back to "No example available.".

    Returns:
        The complete prompt string.
    """
    relevant_tables = self._identify_relevant_tables(user_q)

    # Uses the simplified "Table: / Columns:" schema formatter.
    schema_str = self._format_relevant_schema(relevant_tables)

    example_str = "No example available."
    if examples:
        best_example = examples[0]
        if isinstance(best_example, dict):
            example_question = best_example.get('question')
            example_sql = best_example.get('sql')
            # Only show the example when both halves are present; a partial
            # example would otherwise raise or print a literal "None".
            if example_question and example_sql:
                example_str = (
                    f"Question: {example_question}\nSQL:\n```sql\n{example_sql}\n```"
                )

    # Strong separators and explicit headings; assembled line by line so the
    # prompt layout does not depend on source-code indentation.
    prompt_lines = [
        "### INSTRUCTIONS ###",
        "You are a SQLite expert. Your only job is to generate a single, valid SQLite query based on the provided schema and question.",
        "- ONLY use the tables and columns from the schema below.",
        "- ALWAYS use SQLite syntax (e.g., `strftime('%Y', date_column)` for years).",
        "- The report completion date is the `ReportAuthorization` column in the `JobTimeline` table.",
        "- Your output MUST be ONLY the SQL query inside a ```sql code block.",
        "",
        "### SCHEMA ###",
        schema_str,
        "",
        "### EXAMPLE ###",
        example_str,
        "",
        "### TASK ###",
        "Generate a SQLite query for the following question.",
        f"Question: {user_q}",
        "SQL:",
        "```sql",
        "",  # keeps the trailing newline after the opened code fence
    ]
    prompt = "\n".join(prompt_lines)

    self._log(f"📏 Prompt 長度: {len(prompt)} 字符")
    # No length-truncation logic here: the schema text is already simplified.
    return prompt
|
| 720 |
|
| 721 |
|