Spaces:
Paused
Paused
File size: 3,425 Bytes
1804a7a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 |
# Tool script: extract chart/graph data from PDFs via the project's VisionAgent.
import sys
import os

# --- PATH FIXER (CRITICAL) ---
# This ensures we can import 'src' modules regardless of where we run this script from.
current_dir = os.path.dirname(os.path.abspath(__file__))  # .../src/tools
src_dir = os.path.dirname(current_dir)                    # .../src
project_root = os.path.dirname(src_dir)                   # .../ProjectA_Backup
if project_root not in sys.path:
    sys.path.insert(0, project_root)
# -----------------------------

import json
import glob
from pdf2image import convert_from_path  # third-party: renders PDF pages to PIL images
from src.agents.vision import VisionAgent
from tqdm import tqdm  # NOTE(review): imported but unused in this file — confirm before removing

# CONFIG
# NOTE: both paths resolve under src/, not the project root — intentional? verify.
DOCS_DIR = os.path.join(src_dir, "data", "docs")                  # input PDFs
OUTPUT_FILE = os.path.join(src_dir, "data", "chart_data.jsonl")   # JSONL output, one record per chart page
def extract_visual_data():
    """Scan every PDF in DOCS_DIR for chart-like pages and dump them to JSONL.

    Pipeline per PDF:
      1. Render pages to images (first 10 pages only, for demo speed).
      2. Run the VisionAgent twice per page: captioning (shape/trend) and
         OCR (axis numbers).
      3. Keep pages whose caption mentions a chart-like keyword; append
         {source, page, content} records to OUTPUT_FILE (one JSON per line).

    Errors loading the agent or processing a single PDF are logged and
    skipped (best-effort); the function returns None in all cases.
    """
    print("[INFO] Initializing Vision Agent for Chart Reading...")
    try:
        vision = VisionAgent()
    except Exception as e:
        # Without the agent nothing downstream can run — bail out early.
        print(f"[ERROR] Failed to load Vision Agent: {e}")
        return

    pdf_files = glob.glob(os.path.join(DOCS_DIR, "*.pdf"))
    if not pdf_files:
        print(f"[ERROR] No PDFs found in {DOCS_DIR}")
        return

    extracted_data = []
    print(f"[INFO] Found {len(pdf_files)} PDFs. Scanning for charts...")

    # Chart-detection heuristic keywords ("biểu đồ" is Vietnamese for "chart").
    keywords = ["chart", "graph", "plot", "diagram", "figure", "table", "biểu đồ"]

    # Single reusable scratch file: the VisionAgent reads images from disk.
    temp_img_path = "temp_page.jpg"
    try:
        for pdf_path in pdf_files:
            filename = os.path.basename(pdf_path)
            print(f"   [INFO] Processing {filename}...")
            try:
                # Convert PDF pages to images (in memory).
                # Limited to first 10 pages for demo speed. Remove [:10] for a full scan.
                pages = convert_from_path(pdf_path)[:10]
                for i, page_image in enumerate(pages):
                    page_image.save(temp_img_path, "JPEG")
                    # 1. Captioning — understand the graph shape/trend.
                    caption = vision.analyze_image(temp_img_path, task_hint="describe caption")
                    # 2. OCR — read the numbers on the axes.
                    text_data = vision.analyze_image(temp_img_path, task_hint="OCR")
                    # Combine both into a single "visual context" blob.
                    combined_context = (
                        f"Page {i+1} Visual Data:\n"
                        f"- Description: {caption}\n"
                        f"- Text Content: {text_data}"
                    )
                    # Keep the page only if the caption suggests a chart is present.
                    if any(k in caption.lower() for k in keywords):
                        print(f"      [INFO] Found chart on page {i+1}")
                        extracted_data.append({
                            "source": filename,
                            "page": i + 1,
                            "content": combined_context,
                        })
            except Exception as e:
                # Best-effort: log the failing file and continue with the rest.
                print(f"[ERROR] Error processing {filename}: {e}")
    finally:
        # Don't leave the scratch image behind after the run.
        if os.path.exists(temp_img_path):
            os.remove(temp_img_path)

    # Persist raw visual data as JSONL (ensure_ascii=False keeps Vietnamese text readable).
    os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        for entry in extracted_data:
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")

    print(f"[INFO] Extracted {len(extracted_data)} visual insights.")
    print(f"[INFO] File saved: {OUTPUT_FILE}")
# Entry point: run the full extraction pipeline when executed as a script.
if __name__ == "__main__":
    extract_visual_data()