Agentic / tools_processing.py
vachaspathi's picture
Create tools_processing.py
3f041f9 verified
# tools_processing.py
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
import trafilatura # The best tool for scraping text from generic URLs
import os
# --- TOOL 1: THE EYES (OCR) ---
def perform_ocr(file_obj):
    """
    Convert a PDF or image file to text using Tesseract.

    Args:
        file_obj: Filesystem path (``str`` or ``os.PathLike``) of the file to
            read. A path ending in ``.pdf`` (case-insensitive) is handled as
            a PDF; anything else is opened as an image.

    Returns:
        A ``(image, text)`` tuple. On success ``image`` is the image that was
        passed to Tesseract (for a PDF, the rendering of its first page) and
        ``text`` is the recognised text. On failure ``image`` is ``None`` and
        ``text`` is an error message; processing errors are reported this
        way rather than raised.
    """
    # None and "" both mean the caller supplied nothing to process.
    if not file_obj:
        return None, "No file provided"
    try:
        path = os.fspath(file_obj)
        if path.lower().endswith(".pdf"):
            # Only the first page is rendered and OCR'd.
            pages = convert_from_path(path, first_page=1, last_page=1)
            if not pages:
                return None, "OCR Failed: PDF contains no pages"
            image = pages[0]
        else:
            # convert() returns an independent copy, so the source file can
            # be closed as soon as the RGB image has been produced.
            with Image.open(path) as source:
                image = source.convert("RGB")
        # Run Tesseract
        text = pytesseract.image_to_string(image)
        return image, text
    except Exception as e:
        # Deliberately broad: this tool reports every failure to its caller
        # as a message instead of propagating the exception.
        return None, f"OCR Failed: {str(e)}"
# --- TOOL 2: THE BRAIN FEED (Web Scraper) ---
def scrape_public_link(url):
    """
    Fetch text from a public URL (Notion, Wiki, etc.) without API keys.

    Args:
        url: Address of the page to scrape. Leading and trailing whitespace
            is ignored.

    Returns:
        ``""`` when no URL is given (``None``, empty, or whitespace only).
        On success, the extracted text wrapped between a
        ``--- START EXTERNAL CONTEXT (<url>) ---`` line and an
        ``--- END CONTEXT ---`` line. Otherwise a string beginning with
        ``"Error"`` that describes the failure; errors are reported this way
        rather than raised.
    """
    # Pasted links often carry stray spaces or newlines; normalise first so
    # a blank field is not mistaken for a real URL.
    if isinstance(url, str):
        url = url.strip()
    if not url:
        return ""
    try:
        print(f"Scraping URL: {url}")
        downloaded = trafilatura.fetch_url(url)
        if downloaded:
            text = trafilatura.extract(downloaded)
            if text:
                return f"--- START EXTERNAL CONTEXT ({url}) ---\n{text}\n--- END CONTEXT ---"
        # Reached when either the download or the text extraction came back
        # empty.
        return "Error: Could not extract text from this link. (Site might be blocking scrapers)"
    except Exception as e:
        # Deliberately broad: any failure becomes a message for the caller.
        return f"Error scraping link: {str(e)}"