Spaces:

vachaspathi
/

Agentic

Sleeping

File size: 1,569 Bytes

3f041f9

# tools_processing.py
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
import trafilatura # The best tool for scraping text from generic URLs
import os

# --- TOOL 1: THE EYES (OCR) ---
def perform_ocr(file_obj):
    """
    Converts a PDF or image file to text using Tesseract.

    Only the first page of a PDF is rendered and recognised.

    Args:
        file_obj: A filesystem path (str, bytes or os.PathLike), or an
            object that exposes the path through a ``name`` attribute
            (e.g. a temporary-file wrapper handed over by an upload widget).

    Returns:
        A ``(image, text)`` tuple. On success ``image`` is the image that
        was passed to Tesseract and ``text`` is the recognised text. On
        failure ``image`` is None and ``text`` is a human-readable message
        ("No file provided" or "OCR Failed: ...").
    """
    if file_obj is None:
        return None, "No file provided"

    try:
        # Resolve the input to a plain string path. Path-likes are checked
        # first because pathlib.Path also has a `.name` attribute, which is
        # only the basename and must not replace the full path.
        if isinstance(file_obj, (str, bytes, os.PathLike)):
            path = os.fsdecode(file_obj)
        else:
            candidate = getattr(file_obj, "name", None)
            if not isinstance(candidate, (str, bytes, os.PathLike)):
                raise TypeError(
                    f"unsupported file input: {type(file_obj).__name__}"
                )
            path = os.fsdecode(candidate)

        if os.path.basename(path).lower().endswith(".pdf"):
            # Convert 1st page to image
            pages = convert_from_path(path, first_page=1, last_page=1)
            if not pages:
                return None, "OCR Failed: PDF contains no pages"
            image = pages[0]
        else:
            # convert() returns an independent copy, so the source handle
            # can be released as soon as the conversion is done.
            with Image.open(path) as source:
                image = source.convert("RGB")

        # Run Tesseract
        text = pytesseract.image_to_string(image)
        return image, text

    except Exception as e:
        # Tool boundary: report the failure as data instead of raising.
        return None, f"OCR Failed: {e}"

# --- TOOL 2: THE BRAIN FEED (Web Scraper) ---
def scrape_public_link(url):
    """
    Downloads a public web page and returns its main text, no API key needed.

    Args:
        url: Address of the page to fetch. A falsy value (None, "") short-
            circuits to an empty string.

    Returns:
        The extracted text wrapped in START/END context markers on success;
        otherwise a string beginning with "Error" that describes why nothing
        could be extracted. Exceptions are reported in the returned string
        rather than raised.
    """
    if not url:
        return ""

    extraction_failed = "Error: Could not extract text from this link. (Site might be blocking scrapers)"

    try:
        print(f"Scraping URL: {url}")
        page_source = trafilatura.fetch_url(url)
        # Only attempt extraction when the download produced something.
        body_text = trafilatura.extract(page_source) if page_source else None
        if not body_text:
            return extraction_failed
        return f"--- START EXTERNAL CONTEXT ({url}) ---\n{body_text}\n--- END CONTEXT ---"
    except Exception as exc:
        return f"Error scraping link: {exc}"