File size: 3,425 Bytes
1804a7a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import sys
import os

# --- PATH FIXER (CRITICAL) ---
# Put the project root at the front of sys.path so that 'src.*' imports
# resolve no matter which working directory this script is launched from.
# Each step below strips one trailing path component.
current_dir = os.path.split(os.path.abspath(__file__))[0]  # folder holding this file (expected: .../src/tools)
src_dir = os.path.split(current_dir)[0]                    # one level up (expected: .../src)
project_root = os.path.split(src_dir)[0]                   # two levels up (expected: the project root)

# Only insert once, so repeated imports of this module do not pile up duplicates.
if project_root not in sys.path:
    sys.path.insert(0, project_root)
# -----------------------------

import json
import glob
from pdf2image import convert_from_path
from src.agents.vision import VisionAgent
from tqdm import tqdm

# CONFIG
# Both the input PDFs and the generated JSONL file live under <src_dir>/data.
_data_dir = os.path.join(src_dir, "data")
DOCS_DIR = os.path.join(_data_dir, "docs")                 # folder scanned for *.pdf
OUTPUT_FILE = os.path.join(_data_dir, "chart_data.jsonl")  # one JSON record per line

# Words whose presence in a page caption marks the page as containing a chart.
# Every entry must be lowercase: captions are lowercased before matching.
_CHART_KEYWORDS = ("chart", "graph", "plot", "diagram", "figure", "table", "biểu đồ")

# Only the first pages of each PDF are scanned, for demo speed.
# Set to None to scan every page.
_MAX_PAGES_PER_PDF = 10


def _iter_chart_pages(vision, pdf_path, temp_img_path):
    """Yield one record for every chart-like page at the start of *pdf_path*.

    Each page is rendered to *temp_img_path* (overwritten on every page),
    captioned by ``vision.analyze_image`` and, only when the caption mentions
    one of ``_CHART_KEYWORDS``, passed through a second OCR call.

    Args:
        vision: Object exposing ``analyze_image(path, task_hint=...)``.
        pdf_path: Path of the PDF to scan.
        temp_img_path: Scratch file used to hand page images to *vision*.

    Yields:
        Dicts with the keys ``"source"``, ``"page"`` (1-based) and
        ``"content"``.

    Raises:
        Whatever ``convert_from_path``, the image save, or the vision calls
        raise; the caller decides how to report it.
    """
    filename = os.path.basename(pdf_path)

    # Let pdf2image stop at the page limit instead of rasterizing the whole
    # document and discarding most of it afterwards.
    pages = convert_from_path(pdf_path, last_page=_MAX_PAGES_PER_PDF)

    for page_no, page_image in enumerate(pages, start=1):
        page_image.save(temp_img_path, "JPEG")

        # 1. Captioning, to understand the shape/trend of any graph.
        caption = vision.analyze_image(temp_img_path, task_hint="describe caption")
        # NOTE(review): analyze_image's return type is not visible here; coerce
        # to text so a None / non-string result cannot crash the heuristic.
        caption_text = "" if caption is None else str(caption)

        # Heuristic: keep the page only if the caption mentions a chart-like word.
        lowered = caption_text.lower()
        if not any(keyword in lowered for keyword in _CHART_KEYWORDS):
            continue

        # 2. OCR, to read the numbers on the axes. Run only for pages that are
        # kept, since its result is unused otherwise.
        text_data = vision.analyze_image(temp_img_path, task_hint="OCR")

        print(f"      📊 Found Chart on Page {page_no}")
        yield {
            "source": filename,
            "page": page_no,
            "content": (
                f"Page {page_no} Visual Data:\n"
                f"- Description: {caption_text}\n"
                f"- Text Content: {text_data}"
            ),
        }


def extract_visual_data():
    """Scan the PDFs in ``DOCS_DIR`` for chart pages and write them to ``OUTPUT_FILE``.

    A ``VisionAgent`` captions the first ``_MAX_PAGES_PER_PDF`` pages of every
    ``*.pdf`` file; pages whose caption looks chart-related are OCR'd and
    stored as one JSON object per line in ``OUTPUT_FILE``.

    The function is best-effort: if the agent cannot be created or no PDF is
    found it prints a message and returns without writing anything, and an
    error inside one PDF is printed and skips the rest of that PDF only.

    Returns:
        None. Results are written to ``OUTPUT_FILE`` and progress is printed.
    """
    import tempfile  # local import: only this function needs it

    print("👁️ Initializing Vision Agent for Chart Reading...")

    try:
        vision = VisionAgent()
    except Exception as e:
        print(f"❌ Failed to load Vision Agent: {e}")
        return

    # Sorted so that the order of records in the output is deterministic.
    pdf_files = sorted(glob.glob(os.path.join(DOCS_DIR, "*.pdf")))
    if not pdf_files:
        print(f"❌ No PDFs found in {DOCS_DIR}")
        return

    extracted_data = []

    print(f"📂 Found {len(pdf_files)} PDFs. Scanning for charts...")

    # The scratch image lives in a private temporary directory, so nothing is
    # left behind in (or clobbered from) the current working directory.
    with tempfile.TemporaryDirectory() as tmp_dir:
        temp_img_path = os.path.join(tmp_dir, "temp_page.jpg")

        for pdf_path in pdf_files:
            filename = os.path.basename(pdf_path)
            print(f"   📄 Processing {filename}...")

            try:
                # Append record by record so that pages found before a
                # failure in the same PDF are still kept.
                for record in _iter_chart_pages(vision, pdf_path, temp_img_path):
                    extracted_data.append(record)
            except Exception as e:
                print(f"❌ Error processing {filename}: {e}")

    # Save raw visual data (the file is written even when nothing was found).
    os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        for entry in extracted_data:
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")

    print(f"✅ Extracted {len(extracted_data)} visual insights.")
    print(f"👉 File saved: {OUTPUT_FILE}")

# Run the extraction only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    extract_visual_data()