File size: 1,569 Bytes
3f041f9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
# tools_processing.py
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
import trafilatura # The best tool for scraping text from generic URLs
import os

# --- TOOL 1: THE EYES (OCR) ---
def perform_ocr(file_obj):
    """
    Convert a PDF or image file to text using Tesseract.

    For a PDF only the first page is rasterised and read; any other file is
    opened as an image and converted to RGB before recognition.

    Args:
        file_obj: Path to the file (str, bytes or os.PathLike), or an object
            that exposes the path through a ``name`` attribute (for example
            a temporary-file wrapper).

    Returns:
        ``(image, text)`` on success, where ``image`` is the PIL image that
        was passed to Tesseract and ``text`` is the recognised string.
        ``(None, message)`` when no file is given or any step fails; errors
        are reported through the message rather than raised.
    """
    if file_obj is None:
        return None, "No file provided"

    try:
        # Real paths are used as-is: a pathlib.Path's ``.name`` is only the
        # basename, so it must not be unwrapped. Any other object is assumed
        # to carry its on-disk path in ``.name``.
        if isinstance(file_obj, (str, bytes, os.PathLike)):
            path = file_obj
        else:
            path = getattr(file_obj, "name", file_obj)

        filename = os.path.basename(os.fsdecode(path))
        if filename.lower().endswith(".pdf"):
            # Only the first page is converted.
            pages = convert_from_path(path, first_page=1, last_page=1)
            if not pages:
                return None, "OCR Failed: PDF contains no pages"
            image = pages[0]
        else:
            # convert() returns an independent, fully loaded copy, so the
            # source image (and its file handle) can be closed right away.
            with Image.open(path) as source:
                image = source.convert("RGB")

        text = pytesseract.image_to_string(image)
        return image, text

    except Exception as e:
        # Boundary function: callers expect an error tuple, never a raise.
        return None, f"OCR Failed: {e}"

# --- TOOL 2: THE BRAIN FEED (Web Scraper) ---
def scrape_public_link(url):
    """
    Fetch the main text of a public URL (Notion, Wiki, etc.) without API keys.

    Args:
        url: Address of the page to scrape. Surrounding whitespace is
            ignored; ``None``, an empty string or a whitespace-only string
            means "no link".

    Returns:
        ``""`` when no link is given.
        The extracted text wrapped in START/END context markers on success.
        A string starting with ``"Error"`` when the page cannot be fetched,
        no text can be extracted, or the scraper raises; this function does
        not raise.
    """
    # A blank value is "no link", not a URL to fetch, and stray whitespace
    # around a real URL should not reach the downloader.
    if isinstance(url, str):
        url = url.strip()
    if not url:
        return ""

    try:
        print(f"Scraping URL: {url}")
        downloaded = trafilatura.fetch_url(url)
        # Extraction is skipped when the download produced nothing.
        text = trafilatura.extract(downloaded) if downloaded else None
    except Exception as e:
        return f"Error scraping link: {e}"

    if text:
        return f"--- START EXTERNAL CONTEXT ({url}) ---\n{text}\n--- END CONTEXT ---"
    return "Error: Could not extract text from this link. (Site might be blocking scrapers)"