Spaces:

vachaspathi
/

Agentic

Sleeping

Agentic / tools_processing.py

Create tools_processing.py

3f041f9 verified 4 months ago

1.57 kB

	# tools_processing.py
	import pytesseract
	from pdf2image import convert_from_path
	from PIL import Image
	import trafilatura # The best tool for scraping text from generic URLs
	import os

	# --- TOOL 1: THE EYES (OCR) ---
	def perform_ocr(file_obj):
	    """
	    Convert a PDF or image file to text using Tesseract.

	    For a PDF (detected by a ``.pdf`` file name, case-insensitive) only
	    the first page is rasterised and OCR'd; any other file is opened as
	    an image and converted to RGB first.

	    Args:
	        file_obj: Path to the input file, or a file-like wrapper that
	            exposes its on-disk path as ``.name`` (as ``tempfile``
	            objects do).

	    Returns:
	        ``(image, text)`` on success, where ``image`` is the image that
	        was passed to Tesseract and ``text`` is its OCR output.
	        ``(None, message)`` when no file was given or when any step
	        failed; failures are reported as a message, never raised.
	    """
	    if file_obj is None:
	        return None, "No file provided"

	    try:
	        # Resolve wrappers to their path; plain paths pass through as-is.
	        path = file_obj
	        if not isinstance(path, (str, os.PathLike)):
	            path = getattr(file_obj, "name", file_obj)

	        filename = os.path.basename(path)
	        if filename.lower().endswith(".pdf"):
	            # Only page 1 is rendered, keeping the conversion cheap.
	            pages = convert_from_path(path, first_page=1, last_page=1)
	            if not pages:
	                return None, "OCR Failed: PDF contains no pages"
	            image = pages[0]
	        else:
	            # convert() returns an independent copy, so the source file
	            # handle can be released as soon as the block exits.
	            with Image.open(path) as source:
	                image = source.convert("RGB")

	        text = pytesseract.image_to_string(image)
	        return image, text

	    except Exception as e:
	        # Tool boundary: callers expect an error string, not an exception.
	        return None, f"OCR Failed: {e}"

	# --- TOOL 2: THE BRAIN FEED (Web Scraper) ---
	def scrape_public_link(url):
	    """
	    Fetch the main text of a public URL (Notion, Wiki, etc.) without API keys.

	    Args:
	        url: Address of the page to scrape. Surrounding whitespace is
	            ignored; an empty, blank or ``None`` value means "no link".

	    Returns:
	        ``""`` when no URL was given. On success, the extracted text
	        wrapped in ``--- START EXTERNAL CONTEXT (<url>) ---`` /
	        ``--- END CONTEXT ---`` markers. Otherwise a human-readable
	        string starting with ``"Error"``; failures are reported as a
	        message, never raised.
	    """
	    # Strip stray spaces/newlines from pasted input so that a blank value
	    # is treated as "no link" instead of being sent to the fetcher.
	    target = url.strip() if isinstance(url, str) else url
	    if not target:
	        return ""

	    try:
	        print(f"Scraping URL: {target}")
	        downloaded = trafilatura.fetch_url(target)
	        # A falsy download means nothing was fetched; skip extraction.
	        text = trafilatura.extract(downloaded) if downloaded else None
	        if text:
	            return f"--- START EXTERNAL CONTEXT ({target}) ---\n{text}\n--- END CONTEXT ---"
	        return "Error: Could not extract text from this link. (Site might be blocking scrapers)"
	    except Exception as e:
	        # Tool boundary: callers expect an error string, not an exception.
	        return f"Error scraping link: {e}"