File size: 4,568 Bytes
388aa42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
"""

Document Processing Agent

Handles PDF and image text extraction

"""

import os
import re

import pytesseract
from PIL import Image
from pypdf import PdfReader


def process_pdf(file_path: str) -> dict:
    """
    Extracts text from PDF file

    Each page's text is prefixed with a "--- Page N ---" marker (1-based)
    so callers can tell where page boundaries were.

    Args:
        file_path: Path to PDF file

    Returns:
        On success, a dictionary with "file_path", "pages" (page count),
        "text" (all pages concatenated) and "success" set to True.
        On any failure, a dictionary with "error", "file_path", an empty
        "text" and "success" set to False. No exception is propagated.
    """
    try:
        if not os.path.exists(file_path):
            # Same shape as the generic failure below, so callers can rely
            # on "success" and "file_path" being present on every result.
            return {
                "error": f"File not found: {file_path}",
                "file_path": file_path,
                "text": "",
                "success": False,
            }

        reader = PdfReader(file_path)

        # Collect per-page sections and join once instead of repeated "+=".
        # "or ''" keeps a page with no extractable text from showing up as
        # the literal string "None" should the extractor return None.
        sections = [
            f"\n--- Page {page_num} ---\n{page.extract_text() or ''}"
            for page_num, page in enumerate(reader.pages, start=1)
        ]

        return {
            "file_path": file_path,
            "pages": len(reader.pages),
            "text": "".join(sections),
            "success": True,
        }

    except Exception as e:
        # Deliberate best-effort boundary: report the problem in the result
        # instead of raising, matching the rest of this module.
        return {
            "error": str(e),
            "file_path": file_path,
            "text": "",
            "success": False,
        }


def process_image(file_path: str, language: str = 'eng+hin') -> dict:
    """
    Extracts text from image using OCR

    Args:
        file_path: Path to image file
        language: Tesseract language code (default: English + Hindi)

    Returns:
        On success, a dictionary with "file_path", "image_size" (the opened
        image's ``size`` attribute), "text" and "success" set to True.
        On any failure, a dictionary with "error", "file_path", an empty
        "text" and "success" set to False. No exception is propagated.
    """
    try:
        if not os.path.exists(file_path):
            # Same shape as the generic failure below, so callers can rely
            # on "success" and "file_path" being present on every result.
            return {
                "error": f"File not found: {file_path}",
                "file_path": file_path,
                "text": "",
                "success": False,
            }

        # Use the image as a context manager so the underlying file handle
        # is released as soon as OCR finishes, even if OCR raises.
        with Image.open(file_path) as img:
            text = pytesseract.image_to_string(img, lang=language)
            image_size = img.size

        return {
            "file_path": file_path,
            "image_size": image_size,
            "text": text,
            "success": True,
        }

    except Exception as e:
        # Deliberate best-effort boundary: report the problem in the result
        # instead of raising, matching the rest of this module.
        return {
            "error": str(e),
            "file_path": file_path,
            "text": "",
            "success": False,
        }


# Heuristic: something shaped like local-part@domain.tld.
_EMAIL_PATTERN = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")

# Heuristic: 9 to 15 digits, optionally led by "+", with at most two
# separator characters (space, tab, parentheses, dot, dash) between digits.
# Requiring a run of digits keeps lone numbers such as the "--- Page 1 ---"
# markers inserted by process_pdf from being reported as a phone number.
_PHONE_PATTERN = re.compile(r"\+?\d(?:[ \t().-]{0,2}\d){8,14}")

_RESUME_IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.tiff', '.bmp')


def process_resume(file_path: str) -> dict:
    """
    Processes resume (PDF or image) and extracts relevant information

    The file extension (case-insensitive) selects the extractor: ".pdf" goes
    to process_pdf, common image extensions go to process_image.

    Args:
        file_path: Path to resume file

    Returns:
        The extractor's result dictionary. When extraction succeeded it is
        extended with "document_type" ("resume"), "contains_email" and
        "contains_phone" (booleans from pattern heuristics on the text).
        For an unsupported extension, a dictionary with "error",
        "file_path", an empty "text" and "success" set to False.
    """
    file_ext = os.path.splitext(file_path)[1].lower()

    if file_ext == '.pdf':
        result = process_pdf(file_path)
    elif file_ext in _RESUME_IMAGE_EXTENSIONS:
        result = process_image(file_path)
    else:
        return {
            "error": f"Unsupported file format: {file_ext}",
            "file_path": file_path,
            "text": "",
            "success": False,
        }

    if result.get("success"):
        # Basic resume parsing (can be enhanced)
        text = result["text"]
        result["document_type"] = "resume"
        result["contains_email"] = _EMAIL_PATTERN.search(text) is not None
        result["contains_phone"] = _PHONE_PATTERN.search(text) is not None

    return result


def batch_process_documents(folder_path: str, file_type: str = "pdf") -> list:
    """
    Processes multiple documents in a folder

    Only direct entries of the folder are considered (no recursion), matched
    by case-insensitive file extension and processed in sorted name order.

    Args:
        folder_path: Path to folder containing documents
        file_type: Type of files to process ("pdf" or "image"). Any other
            value is treated as "pdf".

    Returns:
        List of processing results for each matching document. If
        folder_path is not an existing directory, a one-element list holding
        a dictionary with "error", an empty "text" and "success" False.
    """
    # isdir rather than exists: a path to a regular file would otherwise
    # pass the check and make os.listdir raise.
    if not os.path.isdir(folder_path):
        return [{
            "error": f"Folder not found: {folder_path}",
            "text": "",
            "success": False,
        }]

    # Decide the mode once so the extension filter and the processor always
    # agree; an unrecognised file_type falls back to PDF for both.
    use_image = file_type == "image"
    if use_image:
        valid_extensions = (".jpg", ".jpeg", ".png", ".tiff", ".bmp")
    else:
        valid_extensions = (".pdf",)

    results = []
    # Sorted so the result order does not depend on the filesystem.
    for filename in sorted(os.listdir(folder_path)):
        file_ext = os.path.splitext(filename)[1].lower()
        if file_ext not in valid_extensions:
            continue

        file_path = os.path.join(folder_path, filename)
        if use_image:
            results.append(process_image(file_path))
        else:
            results.append(process_pdf(file_path))

    return results


if __name__ == "__main__":
    # Running the module directly only prints a short usage banner; the
    # functions themselves need real file paths to do anything useful.
    usage_lines = (
        "Document Processing Agent",
        "=" * 50,
        "Available functions:",
        "1. process_pdf(file_path)",
        "2. process_image(file_path)",
        "3. process_resume(file_path)",
        "4. batch_process_documents(folder_path, file_type)",
    )
    for usage_line in usage_lines:
        print(usage_line)