# Tool: scan PDFs for chart/graph pages and export their visual context as JSONL.
# NOTE(review): the original first lines were extraction residue ("Spaces:/Paused/Paused");
# replaced with this reconstructed header so the file parses as Python.
import sys
import os

# --- PATH FIXER (CRITICAL) ---
# This ensures we can import 'src' modules regardless of where we run this script from.
current_dir = os.path.dirname(os.path.abspath(__file__))  # .../src/tools
src_dir = os.path.dirname(current_dir)  # .../src
project_root = os.path.dirname(src_dir)  # .../ProjectA_Backup
if project_root not in sys.path:
    # Prepend so the project's own 'src' package wins over any installed one.
    sys.path.insert(0, project_root)
# -----------------------------

import json
import glob
from pdf2image import convert_from_path  # third-party: renders PDF pages to PIL images
from src.agents.vision import VisionAgent  # project-local vision model wrapper
from tqdm import tqdm  # NOTE(review): imported but never used in this file — confirm before removing

# CONFIG
# Both paths resolve under src/ (i.e. src/data/docs and src/data/chart_data.jsonl).
DOCS_DIR = os.path.join(src_dir, "data", "docs")
OUTPUT_FILE = os.path.join(src_dir, "data", "chart_data.jsonl")
def extract_visual_data():
    """Scan every PDF in DOCS_DIR for chart-like pages and persist their visual context.

    For each of the first 10 pages of each PDF, runs the VisionAgent twice
    (captioning + OCR), keeps pages whose caption mentions a chart-like
    keyword, and writes the kept records to OUTPUT_FILE as JSON lines.

    Side effects: writes OUTPUT_FILE, creates and removes a scratch JPEG.
    Returns None; progress is reported via print().
    """
    print("ποΈ Initializing Vision Agent for Chart Reading...")
    try:
        vision = VisionAgent()
    except Exception as e:
        # Best-effort startup: a missing model/weights should not crash the caller.
        print(f"β Failed to load Vision Agent: {e}")
        return

    pdf_files = glob.glob(os.path.join(DOCS_DIR, "*.pdf"))
    if not pdf_files:
        print(f"β No PDFs found in {DOCS_DIR}")
        return

    extracted_data = []
    print(f"π Found {len(pdf_files)} PDFs. Scanning for charts...")

    temp_img_path = "temp_page.jpg"  # scratch file reused for every page
    try:
        for pdf_path in pdf_files:
            filename = os.path.basename(pdf_path)
            # BUG FIX: this message (and the error message below) printed a
            # literal "(unknown)" instead of interpolating the filename.
            print(f" π Processing {filename}...")
            try:
                # Convert PDF pages to images (in memory).
                # We limit to first 10 pages for demo speed. Remove [:10] for full scan.
                pages = convert_from_path(pdf_path)[:10]
                for i, page_image in enumerate(pages):
                    # Save temp image for the Vision Agent.
                    page_image.save(temp_img_path, "JPEG")
                    # 1. Run Captioning (to understand the graph shape/trend).
                    caption = vision.analyze_image(temp_img_path, task_hint="describe caption")
                    # 2. Run OCR (to read the numbers on the axes).
                    text_data = vision.analyze_image(temp_img_path, task_hint="OCR")
                    # Combine them into a single "visual context" string.
                    combined_context = (
                        f"Page {i+1} Visual Data:\n"
                        f"- Description: {caption}\n"
                        f"- Text Content: {text_data}"
                    )
                    # Heuristic chart detector: keep the page only when the caption
                    # mentions a chart-like keyword (incl. Vietnamese "biα»u Δα»").
                    keywords = ["chart", "graph", "plot", "diagram", "figure", "table", "biα»u Δα»"]
                    if any(k in caption.lower() for k in keywords):
                        print(f" π Found Chart on Page {i+1}")
                        extracted_data.append({
                            "source": filename,
                            "page": i + 1,
                            "content": combined_context,
                        })
            except Exception as e:
                # Best-effort per PDF: one corrupt file must not abort the batch.
                print(f"β Error processing {filename}: {e}")
    finally:
        # BUG FIX: the scratch JPEG was never cleaned up; remove it even on error.
        if os.path.exists(temp_img_path):
            os.remove(temp_img_path)

    # Save raw visual data as JSON lines (one record per detected chart page).
    os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        for entry in extracted_data:
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")

    print(f"β Extracted {len(extracted_data)} visual insights.")
    print(f"π File saved: {OUTPUT_FILE}")
| if __name__ == "__main__": | |
| extract_visual_data() |