Spaces:
Running
Running
| """ | |
| Document Processing Agent | |
| Handles PDF and image text extraction | |
| """ | |
import os
import re

import pytesseract
from PIL import Image
from pypdf import PdfReader
def process_pdf(file_path: str) -> dict:
    """
    Extract the text of every page in a PDF file.

    Args:
        file_path: Path to the PDF file.

    Returns:
        On success, a dictionary with ``file_path``, ``pages`` (page count),
        ``text`` (all pages concatenated, each preceded by a
        ``--- Page N ---`` marker line) and ``success`` set to True.
        On any failure, a dictionary with ``error``, ``file_path``, an empty
        ``text`` and ``success`` set to False. Exceptions are converted into
        that failure dictionary rather than propagated.
    """
    try:
        if not os.path.exists(file_path):
            # Same shape as every other failure, so callers can always read
            # "success" and "file_path" without special-casing this branch.
            return {
                "error": f"File not found: {file_path}",
                "file_path": file_path,
                "text": "",
                "success": False,
            }

        reader = PdfReader(file_path)
        sections = []
        for page_num, page in enumerate(reader.pages, start=1):
            # Fall back to "" so a None/empty extraction result (e.g. a page
            # with no text layer -- TODO confirm for the installed pypdf
            # version) is never rendered as the literal string "None".
            page_text = page.extract_text() or ""
            sections.append(f"\n--- Page {page_num} ---\n{page_text}")

        return {
            "file_path": file_path,
            "pages": len(reader.pages),
            # Join once instead of growing a string with += per page.
            "text": "".join(sections),
            "success": True,
        }
    except Exception as e:
        # Best-effort API: report the problem to the caller instead of raising.
        return {
            "error": str(e),
            "file_path": file_path,
            "text": "",
            "success": False,
        }
def process_image(file_path: str, language: str = 'eng+hin') -> dict:
    """
    Extract text from an image file using Tesseract OCR.

    Args:
        file_path: Path to the image file.
        language: Tesseract language code(s) passed as ``lang``
            (default: English + Hindi).

    Returns:
        On success, a dictionary with ``file_path``, ``image_size`` (the
        image's ``size`` attribute), ``text`` (the OCR output) and
        ``success`` set to True.
        On any failure, a dictionary with ``error``, ``file_path``, an empty
        ``text`` and ``success`` set to False. Exceptions are converted into
        that failure dictionary rather than propagated.
    """
    try:
        if not os.path.exists(file_path):
            # Same shape as every other failure, so callers can always read
            # "success" and "file_path" without special-casing this branch.
            return {
                "error": f"File not found: {file_path}",
                "file_path": file_path,
                "text": "",
                "success": False,
            }

        # Open the image as a context manager so the underlying file handle
        # is released even when OCR raises.
        with Image.open(file_path) as img:
            text = pytesseract.image_to_string(img, lang=language)
            image_size = img.size

        return {
            "file_path": file_path,
            "image_size": image_size,
            "text": text,
            "success": True,
        }
    except Exception as e:
        # Best-effort API: report the problem to the caller instead of raising.
        return {
            "error": str(e),
            "file_path": file_path,
            "text": "",
            "success": False,
        }
# Heuristic patterns used by process_resume. They only answer "does something
# that looks like an e-mail address / phone number appear in the text?".
# E-mail: non-space text, "@", non-space text containing a dot.
_RESUME_EMAIL_PATTERN = re.compile(r"[^\s@]+@[^\s@]+\.[^\s@]+")
# Phone: optional "+", then 10-15 digits with at most two separator characters
# (space, dash, dot, parenthesis) between consecutive digits. The 10-digit
# minimum keeps year ranges such as "2019-2021" from being counted as a phone.
_RESUME_PHONE_PATTERN = re.compile(r"\+?\d(?:[ \-.()]{0,2}\d){9,14}")
_RESUME_IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.tiff', '.bmp')


def process_resume(file_path: str) -> dict:
    """
    Process a resume (PDF or image) and add basic resume indicators.

    The file is dispatched on its lower-cased extension: ``.pdf`` goes to
    ``process_pdf``; ``.jpg``/``.jpeg``/``.png``/``.tiff``/``.bmp`` go to
    ``process_image``.

    Args:
        file_path: Path to the resume file.

    Returns:
        The dictionary returned by the underlying processor. When that
        result reports success it is extended with ``document_type``
        ("resume"), ``contains_email`` and ``contains_phone`` (booleans from
        pattern heuristics, not validation). For an unsupported extension a
        failure dictionary with ``error``, ``file_path``, an empty ``text``
        and ``success`` set to False is returned.
    """
    file_ext = os.path.splitext(file_path)[1].lower()

    if file_ext == '.pdf':
        result = process_pdf(file_path)
    elif file_ext in _RESUME_IMAGE_EXTENSIONS:
        result = process_image(file_path)
    else:
        return {
            "error": f"Unsupported file format: {file_ext}",
            "file_path": file_path,
            "text": "",
            "success": False,
        }

    if result.get("success"):
        # Basic resume parsing (can be enhanced).
        text = result["text"]
        result["document_type"] = "resume"
        # A bare "@" or a single digit is present in almost any document, so
        # require an address-like / phone-like pattern instead.
        result["contains_email"] = _RESUME_EMAIL_PATTERN.search(text) is not None
        result["contains_phone"] = _RESUME_PHONE_PATTERN.search(text) is not None

    return result
def batch_process_documents(folder_path: str, file_type: str = "pdf") -> list:
    """
    Process every matching document directly inside a folder.

    Args:
        folder_path: Path to the folder containing the documents.
        file_type: Type of files to process, "pdf" or "image". Any other
            value is treated as "pdf".

    Returns:
        A list with one result dictionary per matching file, in sorted
        filename order. If ``folder_path`` is not an existing directory, a
        single-element list holding a failure dictionary (``error``, empty
        ``text``, ``success`` set to False) is returned.
    """
    # isdir (rather than exists) also rejects a path that points at a regular
    # file, which would otherwise make os.listdir raise.
    if not os.path.isdir(folder_path):
        return [{
            "error": f"Folder not found: {folder_path}",
            "text": "",
            "success": False,
        }]

    extensions = {
        "pdf": [".pdf"],
        "image": [".jpg", ".jpeg", ".png", ".tiff", ".bmp"],
    }
    # Resolve the fallback once so the extension filter and the processor
    # below always agree; previously an unknown file_type selected ".pdf"
    # files but then handed them to the image/OCR processor.
    kind = file_type if file_type in extensions else "pdf"
    valid_extensions = extensions[kind]

    results = []
    # Sort for a deterministic result order; os.listdir order is arbitrary.
    for filename in sorted(os.listdir(folder_path)):
        file_ext = os.path.splitext(filename)[1].lower()
        if file_ext not in valid_extensions:
            continue

        file_path = os.path.join(folder_path, filename)
        if kind == "pdf":
            result = process_pdf(file_path)
        else:
            result = process_image(file_path)
        results.append(result)

    return results
if __name__ == "__main__":
    # Running the module directly only prints a short usage banner; no
    # document is processed here. Supply real file paths to the functions
    # listed below to exercise the agent.
    signatures = (
        "process_pdf(file_path)",
        "process_image(file_path)",
        "process_resume(file_path)",
        "batch_process_documents(folder_path, file_type)",
    )

    print("Document Processing Agent")
    print("=" * 50)
    print("Available functions:")
    # Number the entries starting at 1 to form the menu lines.
    for position, signature in enumerate(signatures, start=1):
        print(f"{position}. {signature}")