Spaces:

Anmol4521
/

jansahayak

Running

File size: 4,568 Bytes

388aa42

"""

Document Processing Agent

Handles PDF and image text extraction

"""

import os
import re

import pytesseract
from PIL import Image
from pypdf import PdfReader


def process_pdf(file_path: str) -> dict:
    """Extract the text of every page of a PDF file.

    Errors are reported through the returned dictionary rather than raised.

    Args:
        file_path: Path to the PDF file.

    Returns:
        On success, a dictionary with ``file_path``, ``pages`` (page count),
        ``text`` (the text of all pages, each preceded by a
        ``--- Page N ---`` marker) and ``success`` set to True.
        On failure (missing file or any extraction error), a dictionary with
        ``error``, ``file_path``, an empty ``text`` and ``success`` set to
        False.
    """
    try:
        if not os.path.exists(file_path):
            # Same key set as every other failure, so callers can always
            # rely on "success" and "file_path" being present.
            return {
                "error": f"File not found: {file_path}",
                "file_path": file_path,
                "text": "",
                "success": False,
            }

        reader = PdfReader(file_path)
        pages = reader.pages

        # Build the per-page sections first and join once instead of growing
        # a string with "+=". The "or ''" guards against extract_text()
        # yielding None (NOTE(review): seen with older PDF libraries --
        # confirm for the pinned pypdf version), which would otherwise put
        # the literal text "None" into the output.
        sections = [
            f"\n--- Page {page_num} ---\n{page.extract_text() or ''}"
            for page_num, page in enumerate(pages, start=1)
        ]

        return {
            "file_path": file_path,
            "pages": len(pages),
            "text": "".join(sections),
            "success": True,
        }

    except Exception as e:
        # Best-effort API: any reader/extraction failure becomes an error
        # result so one bad document does not abort the caller.
        return {
            "error": str(e),
            "file_path": file_path,
            "text": "",
            "success": False,
        }


def process_image(file_path: str, language: str = 'eng+hin') -> dict:
    """Extract text from an image file using Tesseract OCR.

    Errors are reported through the returned dictionary rather than raised.

    Args:
        file_path: Path to the image file.
        language: Tesseract language code (default: English + Hindi).

    Returns:
        On success, a dictionary with ``file_path``, ``image_size`` (the
        ``size`` attribute of the opened image), ``text`` (the OCR output)
        and ``success`` set to True.
        On failure (missing file or any OCR error), a dictionary with
        ``error``, ``file_path``, an empty ``text`` and ``success`` set to
        False.
    """
    try:
        if not os.path.exists(file_path):
            # Same key set as every other failure, so callers can always
            # rely on "success" and "file_path" being present.
            return {
                "error": f"File not found: {file_path}",
                "file_path": file_path,
                "text": "",
                "success": False,
            }

        # Use the image as a context manager so the underlying file handle
        # is released even if OCR raises.
        with Image.open(file_path) as img:
            text = pytesseract.image_to_string(img, lang=language)
            image_size = img.size

        return {
            "file_path": file_path,
            "image_size": image_size,
            "text": text,
            "success": True,
        }

    except Exception as e:
        # Best-effort API: any decoding/OCR failure becomes an error result
        # so one bad image does not abort the caller.
        return {
            "error": str(e),
            "file_path": file_path,
            "text": "",
            "success": False,
        }


# Heuristic: something shaped like local@domain.tld.
_EMAIL_PATTERN = re.compile(r"[\w.+-]+@[\w-]+(?:\.[\w-]+)+")

# Heuristic: 10-15 digits, optionally prefixed with "+", with at most two
# separator characters (space, dot, hyphen, parenthesis) between digits.
# Short numbers such as the "--- Page N ---" markers added during PDF
# extraction do not match.
_PHONE_PATTERN = re.compile(r"(?<!\d)\+?\d(?:[ .()-]{0,2}\d){9,14}(?!\d)")


def process_resume(file_path: str) -> dict:
    """Extract text from a resume (PDF or image) and add basic resume flags.

    The file extension (case-insensitive) selects the extractor: ``.pdf``
    goes through ``process_pdf``; ``.jpg``, ``.jpeg``, ``.png``, ``.tiff``
    and ``.bmp`` go through ``process_image``.

    Args:
        file_path: Path to the resume file.

    Returns:
        The extractor's result dictionary. When extraction succeeded it is
        extended with ``document_type`` ("resume"), ``contains_email`` and
        ``contains_phone``. The two flags are pattern-based heuristics, not
        validation. For an unsupported extension, a dictionary with
        ``error``, an empty ``text`` and ``success`` set to False.
    """
    file_ext = os.path.splitext(file_path)[1].lower()

    if file_ext == '.pdf':
        result = process_pdf(file_path)
    elif file_ext in ('.jpg', '.jpeg', '.png', '.tiff', '.bmp'):
        result = process_image(file_path)
    else:
        return {
            "error": f"Unsupported file format: {file_ext}",
            "text": "",
            "success": False,
        }

    if result.get("success"):
        # Basic resume parsing (can be enhanced)
        text = result["text"]
        result["document_type"] = "resume"
        # A bare "@" or any single digit is not evidence of contact details;
        # in particular every extracted PDF contains digits via its page
        # markers. Require an email-/phone-shaped match instead.
        result["contains_email"] = _EMAIL_PATTERN.search(text) is not None
        result["contains_phone"] = _PHONE_PATTERN.search(text) is not None

    return result


def batch_process_documents(folder_path: str, file_type: str = "pdf") -> list:
    """Process every matching document found directly inside a folder.

    Files are matched by extension (case-insensitive) and processed in
    sorted filename order so the result order is reproducible. Sub-folders
    are not descended into, and directory entries are skipped.

    Args:
        folder_path: Path to the folder containing the documents.
        file_type: Type of files to process: "pdf" or "image". Any other
            value is treated as "pdf".

    Returns:
        A list with one result dictionary per processed file (as returned by
        ``process_pdf`` / ``process_image``). If ``folder_path`` is not an
        existing directory, a single-element list holding an error
        dictionary with ``success`` set to False.
    """
    # isdir (rather than exists) also rejects a path that points at a
    # regular file, which would otherwise make os.listdir raise.
    if not os.path.isdir(folder_path):
        return [{
            "error": f"Folder not found: {folder_path}",
            "text": "",
            "success": False,
        }]

    extensions = {
        "pdf": (".pdf",),
        "image": (".jpg", ".jpeg", ".png", ".tiff", ".bmp"),
    }

    # Normalise unknown types up front so the extension filter and the
    # extractor choice below always agree; otherwise an unknown type would
    # select ".pdf" files but hand them to the image extractor.
    if file_type not in extensions:
        file_type = "pdf"
    valid_extensions = extensions[file_type]

    results = []
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)

        # Only regular files are documents; skip e.g. a directory that
        # happens to be named "scans.pdf".
        if not os.path.isfile(file_path):
            continue
        if os.path.splitext(filename)[1].lower() not in valid_extensions:
            continue

        if file_type == "pdf":
            result = process_pdf(file_path)
        else:
            result = process_image(file_path)
        results.append(result)

    return results


if __name__ == "__main__":
    # Running the module directly only prints a usage banner; no document is
    # processed because that requires real file paths.
    banner_lines = (
        "Document Processing Agent",
        "=" * 50,
        "Available functions:",
        "1. process_pdf(file_path)",
        "2. process_image(file_path)",
        "3. process_resume(file_path)",
        "4. batch_process_documents(folder_path, file_type)",
    )
    for banner_line in banner_lines:
        print(banner_line)