Spaces:
Sleeping
Sleeping
| # tools_processing.py | |
| import pytesseract | |
| from pdf2image import convert_from_path | |
| from PIL import Image | |
| import trafilatura # The best tool for scraping text from generic URLs | |
| import os | |
| # --- TOOL 1: THE EYES (OCR) --- | |
def perform_ocr(file_obj):
    """
    Run Tesseract OCR on an image file, or on the first page of a PDF.

    Args:
        file_obj: Path to the input file (``str`` or ``os.PathLike``), or an
            object that exposes that path through a ``name`` attribute.
            ``None`` means no file was supplied.

    Returns:
        A ``(image, text)`` tuple. On success ``image`` is the image that was
        handed to Tesseract and ``text`` is the recognised string. On any
        failure ``image`` is ``None`` and ``text`` is an error message; this
        function does not raise for OCR/IO errors.
    """
    if file_obj is None:
        return None, "No file provided"
    try:
        # Real paths pass through untouched (pathlib.Path also has a `.name`,
        # but there it is only the basename, so it must not be unwrapped).
        # Any other object is assumed to carry its on-disk path in `.name`.
        if isinstance(file_obj, (str, os.PathLike)):
            path = file_obj
        else:
            path = getattr(file_obj, "name", file_obj)

        filename = os.path.basename(path)
        if filename.lower().endswith(".pdf"):
            # Only the first page is rendered and OCR'd.
            pages = convert_from_path(path, first_page=1, last_page=1)
            if not pages:
                # Explicit message instead of an opaque IndexError text.
                return None, "OCR Failed: PDF contains no pages"
            image = pages[0]
        else:
            # The context manager releases the source file handle; convert()
            # returns an independent, fully loaded image that outlives it.
            with Image.open(path) as source_image:
                image = source_image.convert("RGB")

        # Run Tesseract
        text = pytesseract.image_to_string(image)
        return image, text
    except Exception as e:
        # Best-effort tool: report the failure as text rather than raising.
        return None, f"OCR Failed: {str(e)}"
| # --- TOOL 2: THE BRAIN FEED (Web Scraper) --- | |
def scrape_public_link(url):
    """
    Download a public web page and return its extracted text as a context block.

    Args:
        url: Address of the page to fetch. A falsy value (``None``, ``""``)
            short-circuits to an empty string.

    Returns:
        The extracted text wrapped in START/END context markers, or a string
        starting with ``"Error"`` when the page could not be downloaded,
        yielded no text, or an exception occurred.
    """
    if not url:
        return ""

    try:
        print(f"Scraping URL: {url}")
        page_source = trafilatura.fetch_url(url)
        # Only attempt extraction when the download actually produced content.
        page_text = trafilatura.extract(page_source) if page_source else None
        if not page_text:
            return "Error: Could not extract text from this link. (Site might be blocking scrapers)"
        return f"--- START EXTERNAL CONTEXT ({url}) ---\n{page_text}\n--- END CONTEXT ---"
    except Exception as exc:
        return f"Error scraping link: {exc!s}"