Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,108 +1,87 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
import requests
|
| 3 |
import os
|
| 4 |
-
import
|
|
|
|
| 5 |
import re
|
| 6 |
from huggingface_hub import HfApi, hf_hub_download
|
| 7 |
|
| 8 |
# --- 核心配置 ---
|
| 9 |
-
|
|
|
|
| 10 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
| 11 |
-
MODEL_ID = "gemini-1.5-flash"
|
| 12 |
|
| 13 |
-
|
|
|
|
|
|
|
| 14 |
|
| 15 |
-
#
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
<h3>MG TaxAI | 跨境财税合规实验室 (Beta)</h3>
|
| 19 |
-
<p>本系统依托 <b>MG 核心智库</b> 构建,旨在实现解析结果实时溯源至各国官方税收协定与法律文本。目前系统正处于<b>知识库全量装载阶段</b>,已优先上线核心业务国家的官方协定库。</p>
|
| 20 |
-
<p>我们正持续同步全球各主要经济体的国别投资税收指南及多税种年度税收报告。受限于测试版的数据填充进度,相关解析结果仅供专业参考。MG 团队正加速完善每一条咨询建议的合规证据链,以确保交付专家级的数字化合规支持。</p>
|
| 21 |
-
<hr style="border: 0; border-top: 1px solid #eee; margin: 10px 0;">
|
| 22 |
-
<p style="font-size: 0.85em; color: #666;">
|
| 23 |
-
<b>⚠️ AI 免责声明:</b><br>
|
| 24 |
-
本系统生成的内容由人工智能根据现有库文件分析得出,不构成正式的法律或税务建议。在使用本系统结果进行任何商业决策前,请务必咨询 MG Consult 专业团队。
|
| 25 |
-
</p>
|
| 26 |
-
</div>
|
| 27 |
-
"""
|
| 28 |
-
|
| 29 |
-
def fetch_dataset_context(query):
    """Collect reference text from dataset PDFs whose file names match the query.

    Keywords are extracted from ``query`` (runs of 2+ CJK characters or 3+
    ASCII letters, minus a few generic words). For every repo in ``DATASETS``
    the PDF files whose names contain any keyword are sorted, the first 8 are
    downloaded, and the text of their leading pages is appended under a
    ``[Ref: <file>]`` header.

    Args:
        query: The user's free-text question.

    Returns:
        The concatenated reference text, truncated to 12000 characters.
        Returns ``""`` when ``HF_TOKEN`` is not set or nothing matches.
    """
    if not HF_TOKEN:
        return ""

    max_chars = 12000
    api = HfApi(token=HF_TOKEN)

    # Generic words that would match almost any file name are dropped.
    ignored = ("资料", "关于", "查询", "政策")
    keywords = [
        k
        for k in re.findall(r'[\u4e00-\u9fa5]{2,}|[a-zA-Z]{3,}', query)
        if k not in ignored
    ]
    if not keywords:
        # Fall back to any CJK/ASCII run so that short queries still match.
        keywords = re.findall(r'[\u4e00-\u9fa5]+|[a-zA-Z]+', query)
    # Lower-case once here instead of once per file name.
    needles = [k.lower() for k in keywords]

    chunks = []
    collected = 0
    for repo in DATASETS:
        # Everything past max_chars is cut off below, so stop fetching early.
        if collected >= max_chars:
            break
        try:
            files = api.list_repo_files(repo_id=repo, repo_type="dataset")
        except Exception as e:
            # Best effort: an unreachable repo must not break the other repos.
            print(f"列出数据集文件出错 ({repo}): {e}")
            continue

        matched = sorted(
            f
            for f in files
            if f.lower().endswith(".pdf") and any(k in f.lower() for k in needles)
        )
        for f_path in matched[:8]:
            if collected >= max_chars:
                break
            try:
                temp_path = hf_hub_download(
                    repo_id=repo,
                    filename=f_path,
                    repo_type="dataset",
                    token=HF_TOKEN,
                )
                # The context manager closes the document even if a page fails.
                with fitz.open(temp_path) as doc:
                    body = "".join(page.get_text() for page in doc[:15])
                # NOTE(review): hf_hub_download presumably returns a path inside
                # the local Hugging Face cache; confirm that deleting it here is
                # intended.
                os.remove(temp_path)
            except Exception as e:
                # Best effort: skip the unreadable file and keep the others.
                print(f"读取 PDF 出错 ({f_path}): {e}")
                continue
            chunk = f"\n[Ref: {f_path}]\n" + body
            chunks.append(chunk)
            collected += len(chunk)

    return "".join(chunks)[:max_chars]
|
| 48 |
-
|
| 49 |
-
def find_local_context(query):
|
| 50 |
-
base_dir = "./treaties"
|
| 51 |
-
if not os.path.exists(base_dir): return ""
|
| 52 |
try:
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
return "\n".join(texts)[:8000]
|
| 60 |
-
except: return ""
|
| 61 |
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
if not
|
| 65 |
-
|
| 66 |
-
# --- 核心修改:在系统提示词中强制要求 AI 保持静默,不要自我介绍 ---
|
| 67 |
-
full_system_prompt = (
|
| 68 |
-
f"{system_message}\n\n"
|
| 69 |
-
"【重要约束】:\n"
|
| 70 |
-
"1. 严禁进行任何形式的自我介绍或身份说明(例如:不要说'我是MG的专家'、'你好'等)。\n"
|
| 71 |
-
"2. 严禁包含任何开场白,直接针对用户问题进入专业分析。\n"
|
| 72 |
-
"3. 必须严格基于以下参考资料进行回答。\n\n"
|
| 73 |
-
f"参考资料:\n{knowledge}"
|
| 74 |
-
)
|
| 75 |
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
|
| 82 |
-
url = f"https://generativelanguage.googleapis.com/v1beta/models/{MODEL_ID}:generateContent?key={GEMINI_API_KEY}"
|
| 83 |
try:
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
|
|
|
|
|
|
|
|
|
| 92 |
|
| 93 |
-
#
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
gr.Slider(0, 1, 0.95, label="采样率 (Top-p)"),
|
| 104 |
-
],
|
| 105 |
-
)
|
| 106 |
|
|
|
|
| 107 |
if __name__ == "__main__":
    # Call demo.launch() only when this file is run as a script, not on import.
    demo.launch()
|
|
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import requests
|
| 3 |
import os
|
| 4 |
+
import json
|
| 5 |
+
import fitz # PyMuPDF
|
| 6 |
import re
|
| 7 |
from huggingface_hub import HfApi, hf_hub_download
|
| 8 |
|
| 9 |
# --- 核心配置 ---
|
| 10 |
+
# Recommended: define these variables under Settings -> Secrets on Hugging Face.
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")
HF_TOKEN = os.environ.get("HF_TOKEN")

# OpenRouter's free models are recommended; they are more stable than a direct connection.
# Alternatives: "deepseek/deepseek-chat:free" or "meta-llama/llama-3.3-70b-instruct:free"
# NOTE(review): the selected MODEL_ID has no ":free" suffix, unlike the
# alternatives listed above; confirm it is the intended model.
MODEL_ID = "google/gemini-2.0-flash-001"
|
| 17 |
|
| 18 |
+
# --- PDF 处理逻辑 ---
|
| 19 |
+
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path``.

    Reading is best effort: if the file cannot be opened or a page fails,
    the error is printed and whatever text was gathered so far (possibly
    the empty string) is returned instead of raising.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The page texts concatenated in page order.
    """
    page_texts = []
    try:
        with fitz.open(pdf_path) as doc:
            # An explicit loop keeps already-read pages if a later page raises.
            for page in doc:
                page_texts.append(page.get_text())
    except Exception as e:
        print(f"读取 PDF 出错: {e}")
    # Join once at the end instead of growing a string page by page.
    return "".join(page_texts)
|
|
|
|
|
|
|
| 28 |
|
| 29 |
+
# --- OpenRouter API 调用逻辑 ---
|
| 30 |
+
def ask_ai(user_query, context=""):
    """Send a question to the OpenRouter chat-completions API and return the reply.

    Args:
        user_query: The user's question.
        context: Optional background text (e.g. extracted PDF content). When
            non-empty it is placed before the question in the user message.

    Returns:
        The model's reply text on success; otherwise a human-readable error
        message (missing API key, HTTP 429, other API errors, or a failed
        request). Request and API errors are reported as strings, not raised.
    """
    if not OPENROUTER_API_KEY:
        return "错误:未配置 OPENROUTER_API_KEY。请在环境变量中设置。"

    url = "https://openrouter.ai/api/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "HTTP-Referer": "https://mgconsult.net",  # optional
        "X-Title": "TaxAI Assistant",  # optional
        "Content-Type": "application/json",
    }

    # Build the prompt, passing the PDF content along as background knowledge.
    full_prompt = f"背景知识:\n{context}\n\n问题:\n{user_query}" if context else user_query

    payload = {
        "model": MODEL_ID,
        "messages": [
            {"role": "system", "content": "你是一个专业的国际税务与贸易合规专家,请基于提供的背景知识回答问题。"},
            {"role": "user", "content": full_prompt},
        ],
        "temperature": 0.7,
    }

    # Only the network call sits in this try, so a parsing problem below is
    # not misreported as a failed request.
    try:
        response = requests.post(url, headers=headers, data=json.dumps(payload), timeout=30)
    except requests.RequestException as e:
        return f"请求失败: {e}"

    if response.status_code == 429:
        return "系统繁忙 (429):OpenRouter 的免费额度也暂时达到上限,请等待一分钟再试。"
    if response.status_code != 200:
        return f"API 报错: {response.status_code} - {response.text}"

    try:
        return response.json()['choices'][0]['message']['content']
    except (KeyError, IndexError, TypeError, ValueError):
        # A 200 response can still lack "choices" (e.g. an error object in
        # the body) or not be JSON at all; show the body instead of a bare
        # KeyError text.
        return f"API 报错: {response.status_code} - {response.text}"
|
| 65 |
|
| 66 |
+
# --- Gradio 界面处理 ---
|
| 67 |
+
def chat_handler(message, history):
    """Chat callback: answer ``message`` by calling ``ask_ai`` directly.

    ``history`` is accepted but not used, and no retrieval context is
    passed to the model yet.
    """
    # Hook point for the earlier dataset-retrieval (RAG) logic; for now this
    # demo forwards the message straight to the model.
    return ask_ai(message)
|
| 72 |
|
| 73 |
+
# --- 构建 Gradio UI ---
|
| 74 |
+
with gr.Blocks(title="MG Consulting TaxAI") as demo:
    # Page header: a title line followed by a one-line description.
    gr.Markdown("# 🌍 MG Consulting 国际税务 AI 助手")
    gr.Markdown("基于 OpenRouter 引擎,支持国际税收协定与投资指南查询。")

    # Chat widget wired to chat_handler; example prompts are not pre-cached.
    chatbot = gr.ChatInterface(
        cache_examples=False,
        examples=[
            "美国个人所得税税率是多少?",
            "如何查询丹麦的税收协定?",
        ],
        fn=chat_handler,
    )
|
|
|
|
|
|
|
|
|
|
| 83 |
|
| 84 |
+
# --- 启动应用 ---
|
| 85 |
if __name__ == "__main__":
    # Indentation matters here: demo.launch() must stay inside the if block.
    demo.launch()
|