Spaces:
Running
Running
File size: 4,568 Bytes
"""
Document Processing Agent

Extracts text from PDF files (via pypdf) and from images (via Tesseract OCR).
"""
import os
import pytesseract
from PIL import Image
from pypdf import PdfReader
def process_pdf(file_path: str) -> dict:
    """
    Extracts text from a PDF file, page by page.

    Args:
        file_path: Path to PDF file

    Returns:
        Dictionary describing the outcome. On success it holds
        "file_path", "pages" (number of pages), "text" (the text of every
        page, each preceded by a "--- Page N ---" marker) and
        "success": True. On any failure, including a missing file, it
        holds "error", "file_path", "text": "" and "success": False.
    """
    try:
        if not os.path.exists(file_path):
            # Same shape as the exception path below, so callers can rely
            # on "success" and "file_path" always being present.
            return {
                "error": f"File not found: {file_path}",
                "file_path": file_path,
                "text": "",
                "success": False,
            }

        reader = PdfReader(file_path)

        # Collect the pieces and join once instead of growing a string
        # with += inside the loop.
        page_texts = []
        for page_num, page in enumerate(reader.pages, start=1):
            # Fall back to "" so a page without extractable text never
            # shows up as the literal string "None".
            page_text = page.extract_text() or ""
            page_texts.append(f"\n--- Page {page_num} ---\n{page_text}")

        return {
            "file_path": file_path,
            "pages": len(page_texts),
            "text": "".join(page_texts),
            "success": True,
        }
    except Exception as e:
        # Best-effort API: report the failure to the caller instead of raising.
        return {
            "error": str(e),
            "file_path": file_path,
            "text": "",
            "success": False,
        }
def process_image(file_path: str, language: str = 'eng+hin') -> dict:
    """
    Extracts text from an image using OCR.

    Args:
        file_path: Path to image file
        language: Tesseract language code (default: English + Hindi)

    Returns:
        Dictionary describing the outcome. On success it holds
        "file_path", "image_size" (the image's size attribute), "text"
        (the OCR output) and "success": True. On any failure, including a
        missing file, it holds "error", "file_path", "text": "" and
        "success": False.
    """
    try:
        if not os.path.exists(file_path):
            # Same shape as the exception path below, so callers can rely
            # on "success" and "file_path" always being present.
            return {
                "error": f"File not found: {file_path}",
                "file_path": file_path,
                "text": "",
                "success": False,
            }

        # The context manager closes the underlying file handle even when
        # OCR raises; the original left the image open.
        with Image.open(file_path) as img:
            image_size = img.size
            text = pytesseract.image_to_string(img, lang=language)

        return {
            "file_path": file_path,
            "image_size": image_size,
            "text": text,
            "success": True,
        }
    except Exception as e:
        # Best-effort API: report the failure to the caller instead of raising.
        return {
            "error": str(e),
            "file_path": file_path,
            "text": "",
            "success": False,
        }
def process_resume(file_path: str) -> dict:
    """
    Extracts text from a resume (PDF or image) and adds basic resume flags.

    Args:
        file_path: Path to resume file

    Returns:
        The dictionary produced by process_pdf or process_image, chosen by
        file extension. When extraction succeeded it additionally carries
        "document_type", "contains_email" and "contains_phone". An
        unsupported extension yields an error dictionary with
        "success": False.
    """
    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()

    # Choose the extractor from the (case-insensitive) file extension.
    if suffix == '.pdf':
        extractor = process_pdf
    elif suffix in ('.jpg', '.jpeg', '.png', '.tiff', '.bmp'):
        extractor = process_image
    else:
        return {
            "error": f"Unsupported file format: {suffix}",
            "text": "",
            "success": False
        }

    extracted = extractor(file_path)
    if not extracted.get("success"):
        # Extraction failed: hand the error dictionary back untouched.
        return extracted

    # Basic resume heuristics: an "@" anywhere counts as an email and any
    # digit anywhere counts as a phone number.
    content = extracted["text"]
    extracted["document_type"] = "resume"
    extracted["contains_email"] = "@" in content
    extracted["contains_phone"] = any(map(str.isdigit, content))
    return extracted
def batch_process_documents(folder_path: str, file_type: str = "pdf") -> list:
    """
    Processes every matching document directly inside a folder.

    Args:
        folder_path: Path to folder containing documents
        file_type: Type of files to process ("pdf" or "image"); any other
            value is treated as "pdf"

    Returns:
        List of processing results, one per matching file, ordered by
        file name. If folder_path is not an existing directory, a
        one-element list holding an error dictionary with
        "success": False is returned instead.
    """
    # isdir also rejects a path that exists but is a regular file, which
    # would otherwise make os.listdir raise.
    if not os.path.isdir(folder_path):
        return [{"error": f"Folder not found: {folder_path}", "success": False}]

    extensions = {
        "pdf": (".pdf",),
        "image": (".jpg", ".jpeg", ".png", ".tiff", ".bmp"),
    }
    # Normalise an unknown type once, so the extension filter and the
    # processor below always agree. Previously an unknown type selected
    # ".pdf" files but sent them to process_image.
    if file_type not in extensions:
        file_type = "pdf"
    valid_extensions = extensions[file_type]

    results = []
    # os.listdir returns entries in arbitrary order; sort for stable output.
    for filename in sorted(os.listdir(folder_path)):
        file_ext = os.path.splitext(filename)[1].lower()
        if file_ext not in valid_extensions:
            continue
        file_path = os.path.join(folder_path, filename)
        if file_type == "pdf":
            results.append(process_pdf(file_path))
        else:
            results.append(process_image(file_path))
    return results
if __name__ == "__main__":
    # Running the module directly only prints a usage overview; calling
    # the functions for real requires actual file paths.
    usage_lines = (
        "Document Processing Agent",
        "=" * 50,
        "Available functions:",
        "1. process_pdf(file_path)",
        "2. process_image(file_path)",
        "3. process_resume(file_path)",
        "4. batch_process_documents(folder_path, file_type)",
    )
    for usage_line in usage_lines:
        print(usage_line)
|