Spaces:

Anmol4521
/

jansahayak

Running

App Files Files Community

jansahayak / agents /document_agent.py

Anmol4521

Upload 95 files

388aa42 verified 9 days ago

raw

history blame contribute delete

4.57 kB

	"""
	Document Processing Agent
	Handles PDF and image text extraction
	"""

	import os
	import pytesseract
	from PIL import Image
	from pypdf import PdfReader


	def process_pdf(file_path: str) -> dict:
	"""
	Extracts text from PDF file

	Args:
	file_path: Path to PDF file

	Returns:
	Dictionary with extracted text and metadata
	"""
	try:
	if not os.path.exists(file_path):
	return {"error": f"File not found: {file_path}", "text": ""}

	reader = PdfReader(file_path)
	text = ""

	for page_num, page in enumerate(reader.pages):
	page_text = page.extract_text()
	text += f"\n--- Page {page_num + 1} ---\n{page_text}"

	return {
	"file_path": file_path,
	"pages": len(reader.pages),
	"text": text,
	"success": True
	}

	except Exception as e:
	return {
	"error": str(e),
	"file_path": file_path,
	"text": "",
	"success": False
	}


	def process_image(file_path: str, language: str = 'eng+hin') -> dict:
	"""
	Extracts text from image using OCR

	Args:
	file_path: Path to image file
	language: Tesseract language code (default: English + Hindi)

	Returns:
	Dictionary with extracted text and metadata
	"""
	try:
	if not os.path.exists(file_path):
	return {"error": f"File not found: {file_path}", "text": ""}

	img = Image.open(file_path)
	text = pytesseract.image_to_string(img, lang=language)

	return {
	"file_path": file_path,
	"image_size": img.size,
	"text": text,
	"success": True
	}

	except Exception as e:
	return {
	"error": str(e),
	"file_path": file_path,
	"text": "",
	"success": False
	}


	def process_resume(file_path: str) -> dict:
	"""
	Processes resume (PDF or image) and extracts relevant information

	Args:
	file_path: Path to resume file

	Returns:
	Extracted resume information
	"""
	file_ext = os.path.splitext(file_path)[1].lower()

	if file_ext == '.pdf':
	result = process_pdf(file_path)
	elif file_ext in ['.jpg', '.jpeg', '.png', '.tiff', '.bmp']:
	result = process_image(file_path)
	else:
	return {
	"error": f"Unsupported file format: {file_ext}",
	"text": "",
	"success": False
	}

	if result.get("success"):
	# Basic resume parsing (can be enhanced)
	text = result["text"]
	result["document_type"] = "resume"
	result["contains_email"] = "@" in text
	result["contains_phone"] = any(char.isdigit() for char in text)

	return result


	def batch_process_documents(folder_path: str, file_type: str = "pdf") -> list:
	"""
	Processes multiple documents in a folder

	Args:
	folder_path: Path to folder containing documents
	file_type: Type of files to process ("pdf" or "image")

	Returns:
	List of processing results for each document
	"""
	results = []

	if not os.path.exists(folder_path):
	return [{"error": f"Folder not found: {folder_path}"}]

	extensions = {
	"pdf": [".pdf"],
	"image": [".jpg", ".jpeg", ".png", ".tiff", ".bmp"]
	}

	valid_extensions = extensions.get(file_type, [".pdf"])

	for filename in os.listdir(folder_path):
	file_path = os.path.join(folder_path, filename)
	file_ext = os.path.splitext(filename)[1].lower()

	if file_ext in valid_extensions:
	if file_type == "pdf":
	result = process_pdf(file_path)
	else:
	result = process_image(file_path)

	results.append(result)

	return results


	if __name__ == "__main__":
	# Test the agent
	# Note: You'll need to provide actual file paths to test

	# Example usage
	print("Document Processing Agent")
	print("=" * 50)
	print("Available functions:")
	print("1. process_pdf(file_path)")
	print("2. process_image(file_path)")
	print("3. process_resume(file_path)")
	print("4. batch_process_documents(folder_path, file_type)")