# tools_processing.py
"""Helper tools: OCR for uploaded PDFs/images and text scraping of public URLs."""

import os

import pytesseract
import trafilatura  # The best tool for scraping text from generic URLs
from pdf2image import convert_from_path
from PIL import Image


# --- TOOL 1: THE EYES (OCR) ---
def perform_ocr(file_obj):
    """Convert a PDF or image file to text using Tesseract.

    ``file_obj`` is used as a filesystem path: it is passed to
    ``os.path.basename``, ``convert_from_path`` and ``Image.open``.
    For a ``.pdf`` file (case-insensitive extension check) only the first
    page is rendered and recognised; any other file is opened as an image
    and converted to RGB.

    Returns:
        A ``(image, text)`` tuple on success, where ``image`` is the PIL
        image that was fed to Tesseract. On failure, or when ``file_obj``
        is ``None``, returns ``(None, message)`` with a human-readable
        error message; this function does not raise.
    """
    if file_obj is None:
        return None, "No file provided"

    try:
        filename = os.path.basename(file_obj)
        if filename.lower().endswith(".pdf"):
            # Render only the first page of the PDF to an image.
            pages = convert_from_path(file_obj, first_page=1, last_page=1)
            if not pages:
                # Report an empty render explicitly instead of letting
                # pages[0] fail with an opaque IndexError message.
                return None, "OCR Failed: PDF produced no pages"
            image = pages[0]
        else:
            # convert() returns a new image, so the handle opened by
            # Image.open() can be closed as soon as the conversion is done.
            with Image.open(file_obj) as source:
                image = source.convert("RGB")

        # Run Tesseract
        text = pytesseract.image_to_string(image)
        return image, text
    except Exception as e:
        # Deliberately broad: callers get an error string, never an exception.
        return None, f"OCR Failed: {e}"


# --- TOOL 2: THE BRAIN FEED (Web Scraper) ---
def scrape_public_link(url):
    """Fetch the main text of a public URL (Notion, Wiki, etc.) without API keys.

    Returns:
        An empty string when ``url`` is falsy. Otherwise, the extracted text
        wrapped in START/END context markers that include the URL, or a
        string beginning with ``"Error"`` when the page could not be
        downloaded, no text could be extracted, or an exception occurred.
        This function does not raise.
    """
    if not url:
        return ""

    extraction_failed = (
        "Error: Could not extract text from this link. "
        "(Site might be blocking scrapers)"
    )

    try:
        print(f"Scraping URL: {url}")
        downloaded = trafilatura.fetch_url(url)
        if not downloaded:
            return extraction_failed

        text = trafilatura.extract(downloaded)
        if not text:
            return extraction_failed

        return f"--- START EXTERNAL CONTEXT ({url}) ---\n{text}\n--- END CONTEXT ---"
    except Exception as e:
        # Deliberately broad: scraping failures are reported as text.
        return f"Error scraping link: {e}"