# distill_reasoning_charts.py
# Distills chart/graph descriptions into chain-of-thought training samples
# using a DeepSeek teacher model.
import os
import json
import time
from openai import OpenAI
# --- CONFIGURATION ---
# SECURITY NOTE(review): this API key was previously hard-coded and has been
# exposed in source control — rotate it. Prefer the DEEPSEEK_API_KEY
# environment variable; the literal fallback is kept only so existing runs
# keep working until the key is rotated.
API_KEY = os.getenv("DEEPSEEK_API_KEY", "sk-11dbe37b519f43a2939caccd3a7beabe")
BASE_URL = "https://api.deepseek.com"
MODEL_NAME = "deepseek-reasoner"  # "reasoner" = thinking mode for deep analysis

# Input/Output paths (Google Drive mount as used on Colab)
INPUT_FILE = "/content/drive/MyDrive/ProjectA_Backup/src/data/chart_data.jsonl"
OUTPUT_FILE = "/content/drive/MyDrive/ProjectA_Backup/src/data/distilled_reasoning_charts.jsonl"
def load_chart_data(path=None):
    """Load chart-description records from a JSONL file.

    Args:
        path: Optional path to a JSONL file; defaults to the module-level
            INPUT_FILE when omitted (backward compatible with the old
            zero-argument call).

    Returns:
        A list of dicts, one per non-blank line. Returns an empty list when
        the file does not exist (caller treats that as "run ingest first").
    """
    if path is None:
        path = INPUT_FILE
    data = []
    if os.path.exists(path):
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                # Skip blank lines so trailing newlines don't raise JSONDecodeError.
                if line.strip():
                    data.append(json.loads(line))
    return data
def distill_charts(client, chunks):
    """Turn chart descriptions into chat-format training samples via a teacher LLM.

    For each chunk, the teacher (MODEL_NAME) is prompted to invent a store-owner
    question answerable from the chart, show its reasoning, and answer in
    Vietnamese, returning strict JSON. Each result is reformatted into the
    Qwen-style ``{"messages": [...]}`` layout with the chain of thought wrapped
    in ``<think>`` tags.

    Args:
        client: An OpenAI-compatible client pointed at the DeepSeek endpoint.
        chunks: List of dicts; each must have 'source' and 'content' keys.

    Returns:
        List of training entries. Chunks whose API call or JSON parse fails
        are logged and skipped (best-effort batch, not fail-fast).
    """
    dataset = []
    print(f"🧪 Starting Chart Distillation (Teacher: {MODEL_NAME})...")

    for i, chunk in enumerate(chunks):
        print(f"   📊 Analyzing Chart {i+1}/{len(chunks)} from {chunk['source']}...")

        # The prompt forces the teacher to simulate a business scenario
        # grounded in the visual (computer-vision-described) data.
        prompt = f'''
CONTEXT: The text below is a computer vision description of a Chart/Graph from a Vietnam Retail Report.
VISUAL DATA:
"""{chunk['content']}"""
TASK:
1. **Interpret:** What is the key business insight from this graph?
2. **Scenario:** Imagine a Vietnamese Store Owner asking a question that requires this specific data to answer.
3. **Reasoning:** Show your internal logic (Chain of Thought).
4. **Response:** Answer the store owner in Vietnamese, citing the trend in the graph.
OUTPUT JSON FORMAT:
{{
"user_query": "The question (Vietnamese)",
"thought_process": "The reasoning (English or Vietnamese)",
"response": "The final answer (Vietnamese)"
}}
'''

        try:
            response = client.chat.completions.create(
                model=MODEL_NAME,
                messages=[
                    {"role": "system", "content": "You are a JSON Data Generator."},
                    {"role": "user", "content": prompt}
                ],
                # Forces the API to return a valid JSON object.
                response_format={'type': 'json_object'}
            )
            content = response.choices[0].message.content
            data = json.loads(content)

            # Format for Qwen training: reasoning goes inside <think> tags.
            full_response = f"<think>{data['thought_process']}</think>\n{data['response']}"
            entry = {
                "messages": [
                    {"role": "system", "content": "You are Project A, an expert Data Analyst."},
                    {"role": "user", "content": data['user_query']},
                    {"role": "assistant", "content": full_response}
                ]
            }
            dataset.append(entry)
            print(f"   ✅ Generated Insight: {data['user_query'][:50]}...")
        except Exception as e:
            # Best-effort: log and continue so one bad chunk doesn't kill the batch.
            print(f"   ⚠️ API Error: {e}")

    return dataset
def main():
    """Entry point: load chart data, distill via the teacher model, save JSONL."""
    client = OpenAI(api_key=API_KEY, base_url=BASE_URL)

    # 1. Load data
    chart_chunks = load_chart_data()
    if not chart_chunks:
        print(f"❌ No data found at {INPUT_FILE}. Run 'ingest_charts.py' first.")
        return
    print(f"📊 Found {len(chart_chunks)} chart descriptions.")

    # 2. Run distillation
    # Limit to first 20 for demo speed (remove [:20] for full run)
    new_data = distill_charts(client, chart_chunks[:20])

    # 3. Save as JSONL (ensure_ascii=False keeps Vietnamese text readable)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        for entry in new_data:
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")
    print(f"🎉 Success! Saved {len(new_data)} training samples to {OUTPUT_FILE}")


if __name__ == "__main__":
    main()