File size: 3,907 Bytes
1804a7a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import os
import json
import time
from openai import OpenAI

# --- CONFIGURATION ---
# DeepSeek credentials and endpoint.
# SECURITY: a literal API key is committed below. Supply the key through the
# DEEPSEEK_API_KEY environment variable instead; the literal is kept only as a
# fallback so existing runs keep working. It should be revoked and removed
# from source control.
API_KEY = os.environ.get("DEEPSEEK_API_KEY") or "sk-11dbe37b519f43a2939caccd3a7beabe"
BASE_URL = "https://api.deepseek.com"
MODEL_NAME = "deepseek-reasoner"  # Use Thinking Mode for deep analysis

# Input/Output (JSON Lines files: one JSON object per line)
INPUT_FILE = "/content/drive/MyDrive/ProjectA_Backup/src/data/chart_data.jsonl"
OUTPUT_FILE = "/content/drive/MyDrive/ProjectA_Backup/src/data/distilled_reasoning_charts.jsonl"

def load_chart_data(path=None):
    """Load chart records from a JSON Lines file.

    Args:
        path: File to read. Defaults to the module-level ``INPUT_FILE``.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.
        An empty list is returned when the file does not exist.

    Blank lines are ignored. A line that is not valid JSON is reported on
    stdout and skipped, so a single corrupt record does not abort the load.
    """
    path = INPUT_FILE if path is None else path
    records = []
    try:
        with open(path, 'r', encoding='utf-8') as f:
            for line_no, raw in enumerate(f, start=1):
                line = raw.strip()
                if not line:
                    continue
                try:
                    records.append(json.loads(line))
                except json.JSONDecodeError as e:
                    print(f"   ⚠️ Skipping malformed JSON on line {line_no} of {path}: {e}")
    except FileNotFoundError:
        # A missing input file is an expected condition: the caller treats an
        # empty result as "nothing to do".
        return []
    return records

def distill_charts(client, chunks, model=None):
    """Turn chart descriptions into chat-format training samples.

    For every chunk the teacher model is asked to produce a JSON object with
    the keys ``user_query``, ``thought_process`` and ``response``. Each valid
    reply becomes one entry of the form ``{"messages": [system, user,
    assistant]}`` where the assistant text is
    ``<think>{thought_process}</think>\\n{response}``.

    Args:
        client: Object exposing ``chat.completions.create(...)`` (the file
            builds it with ``OpenAI(...)``).
        chunks: Sequence of dicts. ``content`` is the chart description sent
            to the teacher; ``source`` is only used in progress output.
        model: Teacher model name. Defaults to the module-level ``MODEL_NAME``.

    Returns:
        List of training entries, one per chunk that was processed
        successfully. Chunks are never fatal: a missing/empty ``content``,
        a failed API call, or an unusable reply is reported on stdout and
        the chunk is skipped.
    """
    model = MODEL_NAME if model is None else model
    dataset = []
    print(f"🧪 Starting Chart Distillation (Teacher: {model})...")

    total = len(chunks)
    for i, chunk in enumerate(chunks):
        # 'source' is informational only, so its absence must not stop the run.
        source = chunk.get('source', 'unknown')
        print(f"   📊 Analyzing Chart {i+1}/{total} from {source}...")

        content = chunk.get('content')
        if not content:
            # Nothing to analyse; avoid paying for a request that can only
            # produce an invented answer.
            print("      ⚠️ Skipped: chunk has no 'content'.")
            continue

        # The Prompt: Forces the Teacher to simulate a business scenario based on the visual data
        prompt = f'''
        CONTEXT: The text below is a computer vision description of a Chart/Graph from a Vietnam Retail Report.
        
        VISUAL DATA:
        """{content}"""
        
        TASK:
        1. **Interpret:** What is the key business insight from this graph?
        2. **Scenario:** Imagine a Vietnamese Store Owner asking a question that requires this specific data to answer.
        3. **Reasoning:** Show your internal logic (Chain of Thought).
        4. **Response:** Answer the store owner in Vietnamese, citing the trend in the graph.
        
        OUTPUT JSON FORMAT:
        {{
            "user_query": "The question (Vietnamese)",
            "thought_process": "The reasoning (English or Vietnamese)",
            "response": "The final answer (Vietnamese)"
        }}
        '''

        try:
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": "You are a JSON Data Generator."},
                    {"role": "user", "content": prompt}
                ],
                response_format={ 'type': 'json_object' } 
            )
        except Exception as e:
            # Best-effort boundary: any transport/API failure only costs this
            # chunk, not the samples collected so far.
            print(f"      ⚠️ API Error: {e}")
            continue

        # Parse and validate separately so a bad reply is not reported as an
        # API failure. TypeError covers a None/non-string body or a non-object
        # JSON value; ValueError covers json.JSONDecodeError.
        try:
            data = json.loads(response.choices[0].message.content)
            user_query = data['user_query']
            # Format for Qwen Training
            full_response = f"<think>{data['thought_process']}</think>\n{data['response']}"
        except (TypeError, ValueError, KeyError, IndexError, AttributeError) as e:
            print(f"      ⚠️ Invalid teacher output: {e!r}")
            continue

        dataset.append({
            "messages": [
                {"role": "system", "content": "You are Project A, an expert Data Analyst."},
                {"role": "user", "content": user_query},
                {"role": "assistant", "content": full_response}
            ]
        })
        # str() keeps the preview from raising when the query is not a string.
        print(f"      ✅ Generated Insight: {str(user_query)[:50]}...")

    return dataset

def main(limit=20):
    """Run the pipeline: load chart chunks, distill them, write JSONL output.

    Args:
        limit: Upper slice bound applied to the loaded chunks before
            distillation. The default of 20 keeps demo runs short; pass
            ``None`` to process every chunk.

    Nothing is written when no input is found or when distillation yields no
    samples, so an earlier output file is not replaced by an empty one.
    """
    client = OpenAI(api_key=API_KEY, base_url=BASE_URL)

    # 1. Load Data
    chart_chunks = load_chart_data()
    if not chart_chunks:
        print(f"❌ No data found at {INPUT_FILE}. Run 'ingest_charts.py' first.")
        return

    print(f"🔍 Found {len(chart_chunks)} chart descriptions.")

    # 2. Run Distillation
    new_data = distill_charts(client, chart_chunks[:limit])
    if not new_data:
        # Every chunk failed: do not truncate the output or report success.
        print(f"❌ No training samples were generated; {OUTPUT_FILE} was not modified.")
        return

    # 3. Save
    out_dir = os.path.dirname(OUTPUT_FILE)
    if out_dir:
        # dirname is "" for a bare filename, which makedirs would reject.
        os.makedirs(out_dir, exist_ok=True)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps the Vietnamese text readable in the file.
        f.writelines(json.dumps(entry, ensure_ascii=False) + "\n" for entry in new_data)

    print(f"🎉 Success! Saved {len(new_data)} training samples to {OUTPUT_FILE}")

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()