adarsh committed on
Commit
75a48e8
·
1 Parent(s): a9c5023
Files changed (2) hide show
  1. app.py +202 -0
  2. requirements.txt +60 -0
app.py ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import gradio as gr
3
+ import pdfplumber
4
+ import re
5
+ from datetime import datetime
6
+ import pytesseract
7
+ from PIL import Image
8
+ import io
9
+ import os
10
+ import cv2
11
+ import numpy as np
12
+ import tempfile
13
+
14
def preprocess_image(image):
    """Binarize a PIL image so it is easier to OCR.

    The image is converted to grayscale and thresholded at 150: pixels
    brighter than the threshold become 255, all others become 0.

    Args:
        image: A PIL image.

    Returns:
        A single-channel PIL image containing only the values 0 and 255.
    """
    # PIL stores channels in RGB order (OpenCV's default is BGR), so the
    # conversion code must be RGB2GRAY or the red/blue luma weights are
    # swapped. Normalising to 3-channel RGB first also lets grayscale and
    # palette images through, which cvtColor would otherwise reject.
    rgb = np.array(image.convert("RGB"))
    gray = cv2.cvtColor(rgb, cv2.COLOR_RGB2GRAY)
    _, binary = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY)
    return Image.fromarray(binary)
18
+
19
class DocumentAgeExtractor:
    """Extract a person's age from the text and embedded images of a PDF.

    Two strategies are tried in order:

    1. A direct mention such as "age: 25" or "30 years old".
    2. A date of birth such as "DOB: 01-01-1990", from which the age is
       computed relative to today's date.

    Every result is a dict with the keys ``success``, ``age``,
    ``confidence`` and ``method``; failures raised while reading the PDF
    additionally carry an ``error`` message.
    """

    # Inclusive range of ages accepted as plausible. Applied to both the
    # direct-mention and the date-of-birth strategies.
    MIN_AGE = 0
    MAX_AGE = 120

    def __init__(self):
        # NOTE: order matters. The first two patterns capture an age
        # directly; the remaining ones capture a date-of-birth string.
        # _process_text slices the list on that boundary.
        self.age_keywords = [
            # \b keeps words that merely end in "age" ("page 5",
            # "mileage: 30") from being read as an age.
            r'\bage[:\s]+(\d+)',
            r'(\d+)\s+years?\s+old',
            r'date\s+of\s+birth[:\s]+(\d{2}[-/]\d{2}[-/]\d{4})',
            r'(?:dob|date\s+of\s+birth)[:\s]*(\d{2}[-/]\d{2}[-/]\d{4})',
            r'born\s+on[:\s]+(\d{2}[-/]\d{2}[-/]\d{4})'
        ]

    def extract_age_from_pdf(self, pdf_path):
        """Read a PDF and try to extract an age from its contents.

        Text is taken from each page's text layer and, where possible,
        from OCR of the images embedded in the page.

        Args:
            pdf_path: Path of the PDF file to open.

        Returns:
            The result dict produced by ``_process_text``. If reading the
            PDF raises, a failure dict with an extra ``error`` key holding
            the exception message is returned instead.
        """
        try:
            chunks = []
            with pdfplumber.open(pdf_path) as pdf:
                for page in pdf.pages:
                    chunks.append(page.extract_text() or '')
                    for img in page.images:
                        chunks.append(self._ocr_embedded_image(img))

            # Join with newlines so the last word of one page cannot fuse
            # with the first word of the next and create a false match.
            return self._process_text('\n'.join(chunks))

        except Exception as e:
            # Boundary handler: callers expect a result dict, never a raise.
            return {
                'success': False,
                'error': str(e),
                'age': None,
                'confidence': 0,
                'method': None
            }

    def _ocr_embedded_image(self, img):
        """Return the OCR text of one embedded PDF image, or '' if undecodable.

        An embedded stream is frequently raw pixel data rather than a
        complete image file, in which case PIL cannot open it. Such images
        are skipped so they do not discard text already extracted from the
        rest of the document. OCR errors themselves are not suppressed.
        """
        try:
            image_data = img['stream'].get_data()
            image = Image.open(io.BytesIO(image_data))
        except (KeyError, OSError, ValueError):
            return ''
        return pytesseract.image_to_string(image)

    def _process_text(self, text):
        """Search ``text`` for an age, first directly and then via a birth date.

        Args:
            text: The text to search; matching is case-insensitive.

        Returns:
            A dict with ``success``, ``age``, ``confidence`` and ``method``.
            ``method`` is ``'direct_mention'`` (confidence 0.9) or
            ``'dob_calculation'`` (confidence 0.85) on success; on failure
            ``success`` is False and ``age`` is None.
        """
        result = {
            'success': False,
            'age': None,
            'confidence': 0,
            'method': None
        }
        lowered = (text or '').lower()

        # Strategy 1: the age is stated directly.
        for pattern in self.age_keywords[:2]:
            for match in re.finditer(pattern, lowered):
                age = int(match.group(1))
                if self.MIN_AGE <= age <= self.MAX_AGE:
                    result.update({
                        'success': True,
                        'age': age,
                        'confidence': 0.9,
                        'method': 'direct_mention'
                    })
                    return result

        # Strategy 2: derive the age from a date of birth. Day-first
        # formats are tried before month-first ones.
        for pattern in self.age_keywords[2:]:
            for match in re.finditer(pattern, lowered):
                dob_str = match.group(1)
                for fmt in ['%d-%m-%Y', '%d/%m/%Y', '%m-%d-%Y', '%m/%d/%Y']:
                    try:
                        dob = datetime.strptime(dob_str, fmt)
                    except ValueError:
                        continue
                    age = self._calculate_age(dob)
                    # Reject future or implausibly old birth dates, the
                    # same way out-of-range direct mentions are rejected.
                    if self.MIN_AGE <= age <= self.MAX_AGE:
                        result.update({
                            'success': True,
                            'age': age,
                            'confidence': 0.85,
                            'method': 'dob_calculation'
                        })
                        return result

        return result

    def _calculate_age(self, dob):
        """Return the number of whole years between ``dob`` and today."""
        today = datetime.today()
        age = today.year - dob.year
        # Birthday has not happened yet this year.
        if (today.month, today.day) < (dob.month, dob.day):
            age -= 1
        return age
103
+
104
def process_pdf(pdf_file):
    """Extract an age from an uploaded PDF and format the result for display.

    Args:
        pdf_file: The raw bytes of the uploaded PDF, or None when nothing
            was uploaded.

    Returns:
        A dict with the keys ``error``, ``age``, ``confidence`` and
        ``method``. On success ``error`` is None, ``confidence`` is a
        percentage string such as "90.0%" and ``method`` is a title-cased
        label. On failure ``error`` holds a message and the other values
        are None.
    """
    if pdf_file is None:
        return {
            "error": "Please upload a PDF file",
            "age": None,
            "confidence": None,
            "method": None
        }

    temp_pdf_path = None
    try:
        # The extractor opens the PDF by path, so the uploaded bytes are
        # written to a temporary file first.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf:
            temp_pdf_path = temp_pdf.name
            temp_pdf.write(pdf_file)

        extractor = DocumentAgeExtractor()
        result = extractor.extract_age_from_pdf(temp_pdf_path)

        if result['success']:
            return {
                "error": None,
                "age": result['age'],
                "confidence": f"{result['confidence']*100:.1f}%",
                "method": result['method'].replace('_', ' ').title()
            }

        # Surface the extractor's own failure reason when it reported one,
        # rather than masking it as "nothing found".
        extractor_error = result.get('error')
        if extractor_error:
            message = f"Error processing PDF: {extractor_error}"
        else:
            message = "Could not extract age from the document"
        return {
            "error": message,
            "age": None,
            "confidence": None,
            "method": None
        }

    except Exception as e:
        return {
            "error": f"Error processing PDF: {str(e)}",
            "age": None,
            "confidence": None,
            "method": None
        }

    finally:
        # Always remove the temporary file, including when processing
        # raised. Cleanup is best-effort and must not mask the result.
        if temp_pdf_path is not None:
            try:
                os.unlink(temp_pdf_path)
            except OSError:
                pass
148
+
149
def _process_pdf_for_ui(pdf_file):
    """Unpack the ``process_pdf`` result dict into one value per output widget.

    The values are returned in the same order as the ``outputs`` list of
    the click handler below: status, age, confidence, method.
    """
    result = process_pdf(pdf_file)
    return (
        result["error"],
        result["age"],
        result["confidence"],
        result["method"],
    )


# Create the Gradio interface
with gr.Blocks(theme=gr.themes.Soft()) as app:
    gr.Markdown(
        """
        # 📄 Document Age Extractor
        Upload a PDF document containing age or date of birth information, and this tool will extract the person's age.

        ### Supported Formats:
        - Direct age mention (e.g., "age: 25", "30 years old")
        - Date of birth (e.g., "DOB: 01-01-1990", "Born on: 01/01/1990")
        """
    )

    with gr.Row():
        with gr.Column():
            pdf_input = gr.File(
                label="Upload PDF Document",
                file_types=[".pdf"],
                type="binary"
            )
            submit_btn = gr.Button("Extract Age", variant="primary")

        with gr.Column():
            with gr.Group():
                error_output = gr.Textbox(label="Status", interactive=False)
                age_output = gr.Number(label="Extracted Age", interactive=False)
                confidence_output = gr.Textbox(label="Confidence", interactive=False)
                method_output = gr.Textbox(label="Extraction Method", interactive=False)

    # Handle file upload and processing.
    # The outputs must be the components created above. Wrapping them in
    # a new gr.JSON component leaves the four widgets unconnected, so the
    # result never reaches them.
    submit_btn.click(
        fn=_process_pdf_for_ui,
        inputs=[pdf_input],
        outputs=[
            error_output,
            age_output,
            confidence_output,
            method_output
        ]
    )

    gr.Markdown(
        """
        ### Notes:
        - The tool works best with clearly formatted documents
        - Supports both text-based PDFs and PDFs containing images
        - Higher confidence scores indicate more reliable extractions
        """
    )

if __name__ == "__main__":
    app.launch()
requirements.txt ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiofiles==23.2.1
2
+ annotated-types==0.7.0
3
+ anyio==4.8.0
4
+ certifi==2024.12.14
5
+ cffi==1.17.1
6
+ charset-normalizer==3.4.1
7
+ click==8.1.8
8
+ cryptography==44.0.0
9
+ exceptiongroup==1.2.2
10
+ fastapi==0.115.6
11
+ ffmpy==0.5.0
12
+ filelock==3.16.1
13
+ fsspec==2024.12.0
14
+ gradio==5.12.0
15
+ gradio_client==1.5.4
16
+ h11==0.14.0
17
+ httpcore==1.0.7
18
+ httpx==0.28.1
19
+ huggingface-hub==0.27.1
20
+ idna==3.10
21
+ Jinja2==3.1.5
22
+ markdown-it-py==3.0.0
23
+ MarkupSafe==2.1.5
24
+ mdurl==0.1.2
25
+ numpy==2.2.1
26
+ opencv-python==4.11.0.86
27
+ orjson==3.10.14
28
+ packaging==24.2
29
+ pandas==2.2.3
30
+ pdfminer.six==20231228
31
+ pdfplumber==0.11.5
32
+ pillow==11.1.0
33
+ pycparser==2.22
34
+ pydantic==2.10.5
35
+ pydantic_core==2.27.2
36
+ pydub==0.25.1
37
+ Pygments==2.19.1
38
+ pypdfium2==4.30.1
39
+ pytesseract==0.3.13
40
+ python-dateutil==2.9.0.post0
41
+ python-multipart==0.0.20
42
+ pytz==2024.2
43
+ PyYAML==6.0.2
44
+ requests==2.32.3
45
+ rich==13.9.4
46
+ ruff==0.9.2
47
+ safehttpx==0.1.6
48
+ semantic-version==2.10.0
49
+ shellingham==1.5.4
50
+ six==1.17.0
51
+ sniffio==1.3.1
52
+ starlette==0.41.3
53
+ tomlkit==0.13.2
54
+ tqdm==4.67.1
55
+ typer==0.15.1
56
+ typing_extensions==4.12.2
57
+ tzdata==2024.2
58
+ urllib3==2.3.0
59
+ uvicorn==0.34.0
60
+ websockets==14.1