# jansahayak/agents/document_agent.py
# Anmol4521's picture
# Upload 95 files
# 388aa42 verified
"""
Document Processing Agent
Handles PDF and image text extraction
"""
import os
import pytesseract
from PIL import Image
from pypdf import PdfReader
def process_pdf(file_path: str) -> dict:
    """
    Extracts text from PDF file

    Args:
        file_path: Path to PDF file

    Returns:
        Dictionary with extracted text and metadata. Every result carries
        "file_path", "text" and "success". Successful results add "pages"
        (the page count); failed results add "error" and an empty "text".
        Each page's text is preceded by a "--- Page N ---" marker line.
    """
    try:
        if not os.path.exists(file_path):
            # Same shape as the exception path below, so callers can rely on
            # "success" being present on every failure.
            return {
                "error": f"File not found: {file_path}",
                "file_path": file_path,
                "text": "",
                "success": False,
            }

        reader = PdfReader(file_path)
        sections = []
        for page_num, page in enumerate(reader.pages, start=1):
            # NOTE(review): guards against an extractor returning None for a
            # page without a text layer, which would otherwise be rendered
            # as the literal string "None" -- confirm against the pypdf
            # version in use.
            page_text = page.extract_text() or ""
            sections.append(f"\n--- Page {page_num} ---\n{page_text}")

        return {
            "file_path": file_path,
            "pages": len(reader.pages),
            # Join once instead of growing a string with += per page.
            "text": "".join(sections),
            "success": True,
        }
    except Exception as e:
        # Best-effort boundary: any parsing failure is reported in the result
        # dictionary rather than raised to the caller.
        return {
            "error": str(e),
            "file_path": file_path,
            "text": "",
            "success": False,
        }
def process_image(file_path: str, language: str = 'eng+hin') -> dict:
    """
    Extracts text from image using OCR

    Args:
        file_path: Path to image file
        language: Tesseract language code (default: English + Hindi)

    Returns:
        Dictionary with extracted text and metadata. Every result carries
        "file_path", "text" and "success". Successful results add
        "image_size" (the value of the opened image's ``size`` attribute);
        failed results add "error" and an empty "text".
    """
    try:
        if not os.path.exists(file_path):
            # Same shape as the exception path below, so callers can rely on
            # "success" being present on every failure.
            return {
                "error": f"File not found: {file_path}",
                "file_path": file_path,
                "text": "",
                "success": False,
            }

        # Use the image as a context manager so the underlying file handle is
        # released even if OCR raises.
        with Image.open(file_path) as img:
            text = pytesseract.image_to_string(img, lang=language)
            image_size = img.size

        return {
            "file_path": file_path,
            "image_size": image_size,
            "text": text,
            "success": True,
        }
    except Exception as e:
        # Best-effort boundary: decoding or OCR failures are reported in the
        # result dictionary rather than raised to the caller.
        return {
            "error": str(e),
            "file_path": file_path,
            "text": "",
            "success": False,
        }
def process_resume(file_path: str) -> dict:
    """
    Run text extraction on a resume and tag the result with simple flags.

    The file extension (case-insensitive) selects the extractor: ``.pdf``
    goes to ``process_pdf``; common raster image extensions go to
    ``process_image``. Any other extension yields an error result without
    touching the file.

    Args:
        file_path: Path to resume file

    Returns:
        The extractor's result dictionary. When extraction succeeded it is
        extended with "document_type" ("resume"), "contains_email" (True if
        the text contains "@") and "contains_phone" (True if the text
        contains at least one digit).
    """
    image_suffixes = ('.jpg', '.jpeg', '.png', '.tiff', '.bmp')
    suffix = os.path.splitext(file_path)[1].lower()

    if suffix == '.pdf':
        extraction = process_pdf(file_path)
    elif suffix in image_suffixes:
        extraction = process_image(file_path)
    else:
        return {
            "error": f"Unsupported file format: {suffix}",
            "text": "",
            "success": False
        }

    # Failed extractions are passed through untouched.
    if not extraction.get("success"):
        return extraction

    # Basic resume heuristics (can be enhanced)
    body = extraction["text"]
    extraction.update(
        document_type="resume",
        contains_email="@" in body,
        contains_phone=any(ch.isdigit() for ch in body),
    )
    return extraction
def batch_process_documents(folder_path: str, file_type: str = "pdf") -> list:
    """
    Processes multiple documents in a folder

    Args:
        folder_path: Path to folder containing documents
        file_type: Type of files to process ("pdf" or "image"). Any other
            value is treated as "pdf".

    Returns:
        List of processing results for each matching document, ordered by
        file name. If ``folder_path`` is not an existing directory, a
        single-element list holding an error dictionary is returned.
    """
    # isdir (not exists) so that a path to a regular file is reported as an
    # error instead of making os.listdir raise.
    if not os.path.isdir(folder_path):
        return [{"error": f"Folder not found: {folder_path}", "success": False}]

    # Pick the extension filter and the processor together so they can never
    # disagree: an unrecognised file_type falls back to PDF for both.
    if file_type == "image":
        valid_extensions = (".jpg", ".jpeg", ".png", ".tiff", ".bmp")
        processor = process_image
    else:
        valid_extensions = (".pdf",)
        processor = process_pdf

    results = []
    # os.listdir returns entries in arbitrary order; sort for stable output.
    for filename in sorted(os.listdir(folder_path)):
        file_ext = os.path.splitext(filename)[1].lower()
        if file_ext in valid_extensions:
            results.append(processor(os.path.join(folder_path, filename)))
    return results
if __name__ == "__main__":
    # Running the module directly only prints a short usage banner; the
    # processing functions need real file paths to be exercised.
    banner_lines = (
        "Document Processing Agent",
        "=" * 50,
        "Available functions:",
        "1. process_pdf(file_path)",
        "2. process_image(file_path)",
        "3. process_resume(file_path)",
        "4. batch_process_documents(folder_path, file_type)",
    )
    for banner_line in banner_lines:
        print(banner_line)