# Multi_Agent_Model/src/tools/refine_charts.py
# Repo metadata (from scrape): author sonthaiha,
# commit "Fresh Deployment with LFS", 1804a7a
import os
import json
import time
from openai import OpenAI
# --- CONFIGURATION ---
# SECURITY: the original file hard-coded a live-looking DeepSeek API key in
# source. Never commit secrets; read the key from the environment instead.
# Set DEEPSEEK_API_KEY before running (and rotate the leaked key).
API_KEY = os.environ.get("DEEPSEEK_API_KEY", "")
BASE_URL = "https://api.deepseek.com"
MODEL_NAME = "deepseek-reasoner"  # Use Thinking Mode for deep analysis
# Input/Output (Google Drive paths, used under Colab)
INPUT_FILE = "/content/drive/MyDrive/ProjectA_Backup/src/data/chart_data.jsonl"
OUTPUT_FILE = "/content/drive/MyDrive/ProjectA_Backup/src/data/distilled_reasoning_charts.jsonl"
def load_chart_data(path=None):
    """Load chart descriptions from a JSONL file.

    Args:
        path: Optional path to the JSONL file; defaults to the module-level
            INPUT_FILE (kept as a lazy fallback for backward compatibility).

    Returns:
        A list of dicts, one per non-blank line. Returns an empty list when
        the file does not exist, so callers can detect "nothing ingested".
    """
    if path is None:
        path = INPUT_FILE
    data = []
    if os.path.exists(path):
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                # Skip blank lines so trailing newlines don't crash json.loads
                if line.strip():
                    data.append(json.loads(line))
    return data
def distill_charts(client, chunks, model=None):
    """Distill chart descriptions into chat-formatted training samples.

    For each chunk, the teacher model is prompted to invent a store-owner
    question answerable only with that chart's data, show its reasoning,
    and answer in Vietnamese — returned as strict JSON, then reshaped into
    a Qwen-style {"messages": [...]} training entry.

    Args:
        client: OpenAI-compatible client exposing chat.completions.create.
        chunks: Iterable of dicts with 'source' and 'content' keys.
        model: Teacher model name; defaults to module-level MODEL_NAME.

    Returns:
        List of training entries. Chunks whose API call or JSON parse
        fails are skipped (error printed — deliberate best-effort loop).
    """
    if model is None:
        model = MODEL_NAME
    dataset = []
    print(f"πŸ§ͺ Starting Chart Distillation (Teacher: {model})...")
    for i, chunk in enumerate(chunks):
        print(f" πŸ“Š Analyzing Chart {i+1}/{len(chunks)} from {chunk['source']}...")
        # The Prompt: Forces the Teacher to simulate a business scenario based on the visual data
        prompt = f'''
CONTEXT: The text below is a computer vision description of a Chart/Graph from a Vietnam Retail Report.
VISUAL DATA:
"""{chunk['content']}"""
TASK:
1. **Interpret:** What is the key business insight from this graph?
2. **Scenario:** Imagine a Vietnamese Store Owner asking a question that requires this specific data to answer.
3. **Reasoning:** Show your internal logic (Chain of Thought).
4. **Response:** Answer the store owner in Vietnamese, citing the trend in the graph.
OUTPUT JSON FORMAT:
{{
"user_query": "The question (Vietnamese)",
"thought_process": "The reasoning (English or Vietnamese)",
"response": "The final answer (Vietnamese)"
}}
'''
        try:
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": "You are a JSON Data Generator."},
                    {"role": "user", "content": prompt}
                ],
                # Ask the API to guarantee a parseable JSON object
                response_format={ 'type': 'json_object' }
            )
            content = response.choices[0].message.content
            data = json.loads(content)
            # Format for Qwen Training: reasoning wrapped in <think> tags,
            # followed by the user-facing answer.
            full_response = f"<think>{data['thought_process']}</think>\n{data['response']}"
            entry = {
                "messages": [
                    {"role": "system", "content": "You are Project A, an expert Data Analyst."},
                    {"role": "user", "content": data['user_query']},
                    {"role": "assistant", "content": full_response}
                ]
            }
            dataset.append(entry)
            print(f" βœ… Generated Insight: {data['user_query'][:50]}...")
        except Exception as e:
            # Best-effort: skip this chunk and continue with the rest.
            print(f" ⚠️ API Error: {e}")
    return dataset
def main(limit=20):
    """Run the chart-distillation pipeline end to end.

    Loads chart descriptions, distills up to `limit` of them through the
    teacher model, and writes the results as JSONL to OUTPUT_FILE.

    Args:
        limit: Max number of chart chunks to process (demo speed cap,
            previously a hard-coded [:20]); pass None to process all.
    """
    client = OpenAI(api_key=API_KEY, base_url=BASE_URL)
    # 1. Load Data
    chart_chunks = load_chart_data()
    if not chart_chunks:
        print(f"❌ No data found at {INPUT_FILE}. Run 'ingest_charts.py' first.")
        return
    print(f"πŸ” Found {len(chart_chunks)} chart descriptions.")
    # 2. Run Distillation (slicing with None is a no-op: full run)
    new_data = distill_charts(client, chart_chunks[:limit])
    # 3. Save one JSON object per line (JSONL, UTF-8 kept readable)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        for entry in new_data:
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")
    print(f"πŸŽ‰ Success! Saved {len(new_data)} training samples to {OUTPUT_FILE}")

if __name__ == "__main__":
    main()