MGGroup committed on
Commit
402892d
·
verified ·
1 Parent(s): a2b4469

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +68 -89
app.py CHANGED
@@ -1,108 +1,87 @@
1
  import gradio as gr
2
  import requests
3
  import os
4
- import fitz
 
5
  import re
6
  from huggingface_hub import HfApi, hf_hub_download
7
 
8
  # --- 核心配置 ---
9
# Gemini API key read from the environment; None when the variable is unset.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
# Hugging Face token read from the environment; None when the variable is unset.
HF_TOKEN = os.environ.get("HF_TOKEN")
# Model name interpolated into the Gemini generateContent URL.
MODEL_ID = "gemini-1.5-flash"

# Hugging Face dataset repositories that are searched for reference PDFs.
DATASETS = ["MGGroup/Treaties", "MGGroup/InvestmentGuide"]

# Professional description shown at the top of the page (kept as-is, per the owner's request).
DESCRIPTION = """
<div style="text-align: left; border-left: 4px solid #2196F3; padding-left: 15px; margin-bottom: 20px;">
<h3>MG TaxAI | 跨境财税合规实验室 (Beta)</h3>
<p>本系统依托 <b>MG 核心智库</b> 构建,旨在实现解析结果实时溯源至各国官方税收协定与法律文本。目前系统正处于<b>知识库全量装载阶段</b>,已优先上线核心业务国家的官方协定库。</p>
<p>我们正持续同步全球各主要经济体的国别投资税收指南及多税种年度税收报告。受限于测试版的数据填充进度,相关解析结果仅供专业参考。MG 团队正加速完善每一条咨询建议的合规证据链,以确保交付专家级的数字化合规支持。</p>
<hr style="border: 0; border-top: 1px solid #eee; margin: 10px 0;">
<p style="font-size: 0.85em; color: #666;">
<b>⚠️ AI 免责声明:</b><br>
本系统生成的内容由人工智能根据现有库文件分析得出,不构成正式的法律或税务建议。在使用本系统结果进行任何商业决策前,请务必咨询 MG Consult 专业团队。
</p>
</div>
"""
28
-
29
def fetch_dataset_context(query):
    """Collect reference text from PDFs in the configured Hugging Face datasets.

    Keywords are extracted from ``query`` (runs of two or more CJK characters
    or three or more ASCII letters, minus a few generic stop words). For each
    repository in ``DATASETS``, PDF files whose path contains any keyword
    (case-insensitively) are downloaded -- at most eight per repository, in
    sorted order -- and the text of ``doc[:15]`` is appended under a
    ``[Ref: <path>]`` marker.

    Retrieval is best-effort: a failure while processing one repository is
    reported on stdout and the remaining repositories are still tried. Text
    gathered before the failure is kept.

    Args:
        query: The user's question.

    Returns:
        The concatenated text truncated to 12000 characters, or ``""`` when
        ``HF_TOKEN`` is not set or nothing could be extracted.
    """
    if not HF_TOKEN:
        return ""

    api = HfApi(token=HF_TOKEN)
    stop_words = ["资料", "关于", "查询", "政策"]
    keywords = [
        k
        for k in re.findall(r'[\u4e00-\u9fa5]{2,}|[a-zA-Z]{3,}', query)
        if k not in stop_words
    ]
    if not keywords:
        # Nothing survived the strict pattern / stop-word filter: fall back to
        # any run of CJK characters or ASCII letters.
        keywords = re.findall(r'[\u4e00-\u9fa5]+|[a-zA-Z]+', query)
    lowered_keywords = [k.lower() for k in keywords]

    chunks = []
    for repo in DATASETS:
        try:
            files = api.list_repo_files(repo_id=repo, repo_type="dataset")
            matched = sorted(
                f
                for f in files
                if f.lower().endswith(".pdf")
                and any(k in f.lower() for k in lowered_keywords)
            )
            for f_path in matched[:8]:
                temp_path = hf_hub_download(
                    repo_id=repo,
                    filename=f_path,
                    repo_type="dataset",
                    token=HF_TOKEN,
                )
                try:
                    # The context manager closes the document even when text
                    # extraction raises.
                    with fitz.open(temp_path) as doc:
                        pages_text = "".join(page.get_text() for page in doc[:15])
                finally:
                    # Always delete the downloaded copy, including on failure.
                    os.remove(temp_path)
                chunks.append(f"\n[Ref: {f_path}]\n" + pages_text)
        except Exception as exc:
            # Best-effort lookup: report the problem and move on to the next
            # repository instead of silently swallowing every error.
            print(f"Dataset lookup failed for {repo}: {exc}")
            continue
    return "".join(chunks)[:12000]
48
-
49
def find_local_context(query):
    """Collect reference text from PDFs stored under the local ``./treaties`` folder.

    The first sub-folder of ``./treaties`` whose name contains any keyword
    from ``query`` (case-insensitively) is selected. Up to five of its
    ``.pdf`` files, in sorted order, are read and the text of ``doc[:15]`` is
    emitted under a ``[Local: <file>]`` marker.

    Args:
        query: The user's question.

    Returns:
        The joined text truncated to 8000 characters. ``""`` is returned when
        the base folder is missing, no folder matches, or any error occurs
        while reading.
    """
    base_dir = "./treaties"
    if not os.path.exists(base_dir):
        return ""
    try:
        keywords = [
            k.lower() for k in re.findall(r'[\u4e00-\u9fa5]+|[a-zA-Z]+', query)
        ]
        folders = [
            d for d in os.listdir(base_dir)
            if os.path.isdir(os.path.join(base_dir, d))
        ]
        selected = next(
            (f for f in folders if any(k in f.lower() for k in keywords)),
            None,
        )
        if not selected:
            return ""

        path = os.path.join(base_dir, selected)
        pdf_names = sorted(f for f in os.listdir(path) if f.endswith(".pdf"))[:5]
        texts = []
        for pdf in pdf_names:
            # Open each document in a context manager so the file handle is
            # released instead of being left to the garbage collector.
            with fitz.open(os.path.join(path, pdf)) as doc:
                body = "".join(p.get_text() for p in doc[:15])
            texts.append(f"\n[Local: {pdf}]\n" + body)
        return "\n".join(texts)[:8000]
    except Exception:
        # Local lookup is a best-effort fallback: any failure means "no context".
        return ""
61
 
62
def respond(message, history, system_message, max_tokens, temperature, top_p):
    """Answer one chat turn with Gemini, grounded in retrieved reference text.

    Reference text is taken from ``fetch_dataset_context`` and, when that
    returns nothing, from ``find_local_context``. It is appended to the
    system prompt together with fixed constraints that forbid the model from
    introducing itself.

    Args:
        message: The user's current message.
        history: Iterable of ``(user_text, assistant_text)`` pairs from
            earlier turns; empty entries are skipped.
        system_message: Base system instruction supplied by the UI.
        max_tokens: Upper bound on the reply length, sent as ``maxOutputTokens``.
        temperature: Sampling temperature, sent as ``temperature``.
        top_p: Nucleus sampling value, sent as ``topP``.

    Yields:
        One string: the fixed greeting header followed by the model's reply,
        or by a "system busy" message when the request or parsing fails.
    """
    knowledge = fetch_dataset_context(message) or find_local_context(message)

    # The system prompt forces the model to stay silent about its identity;
    # the only greeting is the fixed header added below.
    full_system_prompt = (
        f"{system_message}\n\n"
        "【重要约束】:\n"
        "1. 严禁进行任何形式的自我介绍或身份说明(例如:不要说'我是MG的专家'、'你好'等)。\n"
        "2. 严禁包含任何开场白,直接针对用户问题进入专业分析。\n"
        "3. 必须严格基于以下参考资料进行回答。\n\n"
        f"参考资料:\n{knowledge}"
    )

    contents = []
    for user_text, assistant_text in history:
        if user_text:
            contents.append({"role": "user", "parts": [{"text": user_text}]})
        if assistant_text:
            contents.append({"role": "model", "parts": [{"text": assistant_text}]})
    contents.append({"role": "user", "parts": [{"text": message}]})

    payload = {
        # Send the system prompt through the dedicated field instead of
        # disguising it as a user turn.
        "systemInstruction": {"parts": [{"text": full_system_prompt}]},
        "contents": contents,
        # Forward the UI slider values; previously they were silently ignored.
        "generationConfig": {
            "maxOutputTokens": int(max_tokens),
            "temperature": temperature,
            "topP": top_p,
        },
    }

    url = f"https://generativelanguage.googleapis.com/v1beta/models/{MODEL_ID}:generateContent"
    # Pass the key in a header so it does not appear in URLs that may be logged.
    headers = {"x-goog-api-key": GEMINI_API_KEY or ""}
    try:
        res = requests.post(url, headers=headers, json=payload, timeout=60)
        reply = res.json()['candidates'][0]['content']['parts'][0]['text']
    except Exception:
        # Network errors, non-JSON bodies and unexpected response shapes all
        # end up here; the user only sees a generic busy message.
        reply = "系统繁忙,请稍后再试。"

    # The only identity greeting is defined here, under code control, so the
    # model has no room to improvise one.
    header = "您好,我是 **MG Consult** 的国际税收 AI 专家。很高兴为您提供专业咨询。\n\n---\n\n"
    yield header + reply
 
 
 
92
 
93
# UI: a chat interface wired to respond(). The additional inputs map, in the
# order listed, onto respond()'s system_message, max_tokens, temperature and
# top_p parameters.
demo = gr.ChatInterface(
    fn=respond,
    description=DESCRIPTION,
    theme="soft",
    css=".gradio-container {max-width: 950px !important} .description {margin-bottom: 20px}",
    additional_inputs=[
        # Default system instruction; the original read "是收专家", which looks
        # like "是税收专家" with a character dropped.
        gr.Textbox(value="你代表 MG Consult,是税收专家。请严格基于参考资料提供深度分析。请直接进入正文,严禁自我介绍。", label="系统指令"),
        gr.Slider(512, 4096, 2048, label="回复长度限制"),
        gr.Slider(0, 1, 0.05, label="严谨度 (Temperature)"),
        gr.Slider(0, 1, 0.95, label="采样率 (Top-p)"),
    ],
)
106
 
 
107
if __name__ == "__main__":
    demo.launch()  # must be indented by at least 4 spaces or 1 tab
 
 
1
  import gradio as gr
2
  import requests
3
  import os
4
+ import json
5
+ import fitz # PyMuPDF
6
  import re
7
  from huggingface_hub import HfApi, hf_hub_download
8
 
9
  # --- 核心配置 ---
10
# Recommended: define these variables under Settings -> Secrets on Hugging Face.
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")
HF_TOKEN = os.environ.get("HF_TOKEN")

# Recommended: use a free model through OpenRouter; it is more stable than a direct connection.
# Alternatives: "deepseek/deepseek-chat:free" or "meta-llama/llama-3.3-70b-instruct:free"
# NOTE(review): the ID configured below has no ":free" suffix -- confirm it is the intended tier.
MODEL_ID = "google/gemini-2.0-flash-001"
17
 
18
# --- PDF handling ---
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of the PDF at ``pdf_path``.

    Errors are reported on stdout rather than raised. Text from pages read
    before the failure is still returned, so the result may be partial or
    empty.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The page texts joined in page order; ``""`` if nothing could be read.
    """
    page_texts = []
    try:
        with fitz.open(pdf_path) as doc:
            # Append page by page (not one comprehension) so pages collected
            # before an error survive it.
            for page in doc:
                page_texts.append(page.get_text())
    except Exception as e:
        print(f"读取 PDF 出错: {e}")
    # Join once at the end instead of growing a string with += per page.
    return "".join(page_texts)
 
 
28
 
29
# --- OpenRouter API call ---
def ask_ai(user_query, context=""):
    """Send a question to the OpenRouter chat-completions endpoint.

    Args:
        user_query: The user's question.
        context: Optional background text (for example PDF content). When
            non-empty it is placed before the question in the user message.

    Returns:
        The model's reply on HTTP 200. Otherwise a human-readable message: one
        for a missing API key, one for HTTP 429, one carrying the status code
        and response body for other statuses, and one carrying the exception
        text when the request or response parsing fails.
    """
    if not OPENROUTER_API_KEY:
        return "错误:未配置 OPENROUTER_API_KEY。请在环境变量中设置。"

    # Build the prompt, using the PDF content as background knowledge.
    if context:
        prompt = f"背景知识:\n{context}\n\n问题:\n{user_query}"
    else:
        prompt = user_query

    request_headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "HTTP-Referer": "https://mgconsult.net",  # optional
        "X-Title": "TaxAI Assistant",  # optional
        "Content-Type": "application/json",
    }
    chat_messages = [
        {"role": "system", "content": "你是一个专业的国际税务与贸易合规专家,请基于提供的背景知识回答问题。"},
        {"role": "user", "content": prompt},
    ]
    request_body = {
        "model": MODEL_ID,
        "messages": chat_messages,
        "temperature": 0.7,
    }

    try:
        resp = requests.post(
            "https://openrouter.ai/api/v1/chat/completions",
            headers=request_headers,
            data=json.dumps(request_body),
            timeout=30,
        )
        status = resp.status_code
        if status == 200:
            return resp.json()['choices'][0]['message']['content']
        if status == 429:
            return "系统繁忙 (429):OpenRouter 的免费额度也暂时达到上限,请等待一分钟再试。"
        return f"API 报错: {status} - {resp.text}"
    except Exception as err:
        return f"请求失败: {err!s}"
65
 
66
# --- Gradio chat callback ---
def chat_handler(message, history):
    """Return the model's answer to ``message``.

    Args:
        message: The user's current message.
        history: Earlier turns supplied by Gradio; currently unused.

    Returns:
        Whatever ``ask_ai`` returns for ``message`` (called without context).
    """
    # The earlier dataset-retrieval (RAG) logic can be plugged in here; for
    # now the demo forwards the message straight to the model.
    return ask_ai(message)
72
 
73
# --- Build the Gradio UI ---
# A Blocks page holding two Markdown banners followed by a chat interface
# driven by chat_handler.
with gr.Blocks(title="MG Consulting TaxAI") as demo:
    gr.Markdown("# 🌍 MG Consulting 国际税务 AI 助手")
    gr.Markdown("基于 OpenRouter 引擎,支持国际税收协定与投资指南查询。")

    chatbot = gr.ChatInterface(
        fn=chat_handler,
        examples=["个人所得税率是多少?", "如何查询丹麦的税收协定?"],
        cache_examples=False,
    )
 
 
 
83
 
84
# --- Launch the app ---
if __name__ == "__main__":
    # Indentation matters here: demo.launch() must stay inside the if block.
    demo.launch()