Spaces:

4darsh-Dev
/

auto_doc_age

Build error

App Files Files Community

adarsh committed on Jan 19, 2025

Commit

75a48e8

1 Parent(s): a9c5023

init

Browse files

Files changed (2) hide show

app.py +202 -0
requirements.txt +60 -0

app.py ADDED Viewed

	@@ -0,0 +1,202 @@

+import gradio as gr
+import pdfplumber
+import re
+from datetime import datetime
+import pytesseract
+from PIL import Image
+import io
+import os
+import cv2
+import numpy as np
+import tempfile
def preprocess_image(image, threshold=150):
    """Binarize a PIL image so that text stands out for OCR.

    Args:
        image: A ``PIL.Image.Image`` in any mode.
        threshold: Gray level (0-255); pixels strictly brighter than this
            become white (255), all others become black (0).

    Returns:
        A new single-channel ``PIL.Image.Image`` containing only 0 and 255.
    """
    # Arrays built from PIL images are RGB-ordered, not BGR. Normalising the
    # mode first also lets grayscale, palette and RGBA inputs through instead
    # of failing inside cvtColor.
    rgb = np.array(image.convert("RGB"))
    gray = cv2.cvtColor(rgb, cv2.COLOR_RGB2GRAY)
    _, binary = cv2.threshold(gray, threshold, 255, cv2.THRESH_BINARY)
    return Image.fromarray(binary)
class DocumentAgeExtractor:
    """Find a person's age in the text of a PDF document.

    Two strategies are tried in order: an explicit age mention
    ("age: 25", "30 years old") and, failing that, a date of birth from
    which the age is computed. Results are dicts with the keys ``success``,
    ``age``, ``confidence`` and ``method``; a failure while reading the PDF
    additionally carries an ``error`` message.
    """

    # Plausible human age range; candidates outside it are ignored.
    MIN_AGE = 0
    MAX_AGE = 120

    # Day-first formats are tried before month-first ones, so an ambiguous
    # date such as 03-04-1990 is read as 3 April.
    DOB_FORMATS = ('%d-%m-%Y', '%d/%m/%Y', '%m-%d-%Y', '%m/%d/%Y')

    def __init__(self):
        # Order matters: the first two patterns capture an age directly, the
        # remaining ones capture a date of birth. _process_text relies on
        # that split.
        self.age_keywords = [
            # \b keeps "page 3" or "mileage: 90" from being read as an age.
            r'\bage[:\s]+(\d+)',
            r'(\d+)\s+years?\s+old',
            r'date\s+of\s+birth[:\s]+(\d{2}[-/]\d{2}[-/]\d{4})',
            r'(?:dob|date\s+of\s+birth)[:\s]*(\d{2}[-/]\d{2}[-/]\d{4})',
            r'born\s+on[:\s]+(\d{2}[-/]\d{2}[-/]\d{4})'
        ]

    def extract_age_from_pdf(self, pdf_path):
        """Extract an age from the PDF at ``pdf_path``.

        Text is taken from each page's text layer and, best-effort, from OCR
        of the images embedded in the page.

        Args:
            pdf_path: Path (or file-like object) accepted by
                ``pdfplumber.open``.

        Returns:
            The result dict from ``_process_text``. If the PDF cannot be
            read, a dict with ``success`` False and an ``error`` message.
        """
        try:
            chunks = []
            with pdfplumber.open(pdf_path) as pdf:
                for page in pdf.pages:
                    chunks.append(page.extract_text() or '')
                    for img in page.images:
                        chunks.append(self._ocr_embedded_image(img))
            # Join with newlines so the end of one page cannot fuse with the
            # start of the next into a bogus match.
            return self._process_text('\n'.join(chunks))
        except Exception as e:
            return {
                'success': False,
                'error': str(e),
                'age': None,
                'confidence': 0,
                'method': None
            }

    def _ocr_embedded_image(self, img):
        """Return the OCR text of one pdfplumber image entry, or '' on failure."""
        try:
            image_data = img['stream'].get_data()
            image = Image.open(io.BytesIO(image_data))
            return pytesseract.image_to_string(image)
        except Exception:
            # Deliberately best-effort: embedded streams are frequently raw
            # pixel data PIL cannot identify, and Tesseract may be missing.
            # Neither should throw away text already extracted from the page.
            return ''

    def _process_text(self, text):
        """Search ``text`` for an age, directly stated or derived from a DOB.

        Args:
            text: Document text; ``None`` or empty yields an unsuccessful
                result.

        Returns:
            A dict with ``success``, ``age``, ``confidence`` and ``method``
            (``'direct_mention'`` or ``'dob_calculation'``).
        """
        result = {
            'success': False,
            'age': None,
            'confidence': 0,
            'method': None
        }
        lowered = (text or '').lower()

        # Pass 1: explicit age mentions.
        for pattern in self.age_keywords[:2]:
            for match in re.finditer(pattern, lowered):
                age = int(match.group(1))
                if self.MIN_AGE <= age <= self.MAX_AGE:
                    result.update({
                        'success': True,
                        'age': age,
                        'confidence': 0.9,
                        'method': 'direct_mention'
                    })
                    return result

        # Pass 2: dates of birth.
        for pattern in self.age_keywords[2:]:
            for match in re.finditer(pattern, lowered):
                age = self._age_from_dob_string(match.group(1))
                if age is not None:
                    result.update({
                        'success': True,
                        'age': age,
                        'confidence': 0.85,
                        'method': 'dob_calculation'
                    })
                    return result

        return result

    def _age_from_dob_string(self, dob_str):
        """Parse ``dob_str`` with the known formats; return a plausible age or None."""
        for fmt in self.DOB_FORMATS:
            try:
                dob = datetime.strptime(dob_str, fmt)
            except ValueError:
                continue
            age = self._calculate_age(dob)
            # Rejects future dates (negative age) and implausibly old ones.
            if self.MIN_AGE <= age <= self.MAX_AGE:
                return age
        return None

    def _calculate_age(self, dob, today=None):
        """Return the number of whole years between ``dob`` and ``today``.

        Args:
            dob: Date of birth as a ``datetime``.
            today: Reference date; defaults to the current local date.
        """
        if today is None:
            today = datetime.today()
        # Subtract one year if this year's birthday has not happened yet.
        birthday_pending = (today.month, today.day) < (dob.month, dob.day)
        return today.year - dob.year - birthday_pending
def process_pdf(pdf_file):
    """Extract an age from uploaded PDF bytes.

    The bytes are written to a temporary ``.pdf`` file, handed to
    ``DocumentAgeExtractor`` and the temporary file is always removed
    afterwards, whether or not extraction succeeded.

    Args:
        pdf_file: Raw PDF content as ``bytes``, or ``None`` when nothing was
            uploaded.

    Returns:
        A dict with the keys ``error``, ``age``, ``confidence`` and
        ``method``. On success ``error`` is ``None``, ``confidence`` is a
        percentage string and ``method`` is a title-cased label; otherwise
        ``error`` holds a message and the other values are ``None``.
    """
    if pdf_file is None:
        return {
            "error": "Please upload a PDF file",
            "age": None,
            "confidence": None,
            "method": None
        }

    temp_pdf_path = None
    try:
        # delete=False so the file can be reopened by path after this handle
        # is closed; removal is handled in the finally clause below.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf:
            temp_pdf_path = temp_pdf.name
            temp_pdf.write(pdf_file)

        extractor = DocumentAgeExtractor()
        result = extractor.extract_age_from_pdf(temp_pdf_path)

        if result['success']:
            return {
                "error": None,
                "age": result['age'],
                "confidence": f"{result['confidence']*100:.1f}%",
                "method": result['method'].replace('_', ' ').title()
            }

        # Surface the extractor's own failure reason when it has one instead
        # of masking a read error as "nothing found".
        detail = result.get('error')
        if detail:
            message = f"Error processing PDF: {detail}"
        else:
            message = "Could not extract age from the document"
        return {
            "error": message,
            "age": None,
            "confidence": None,
            "method": None
        }
    except Exception as e:
        return {
            "error": f"Error processing PDF: {str(e)}",
            "age": None,
            "confidence": None,
            "method": None
        }
    finally:
        if temp_pdf_path is not None:
            try:
                os.unlink(temp_pdf_path)
            except OSError:
                # Cleanup is best-effort; it must not replace the result.
                pass
# Create the Gradio interface
def _process_pdf_for_ui(pdf_file):
    """Adapt process_pdf's dict to the positional values the UI outputs expect.

    Returns a 4-tuple in the order (status, age, confidence, method), which
    is the order of the components listed in ``submit_btn.click(outputs=...)``.
    """
    result = process_pdf(pdf_file)
    # An empty status box on success is ambiguous, so show a positive message.
    status = result["error"] or "Age extracted successfully"
    return status, result["age"], result["confidence"], result["method"]


with gr.Blocks(theme=gr.themes.Soft()) as app:
    gr.Markdown(
        """
        # 📄 Document Age Extractor
        Upload a PDF document containing age or date of birth information, and this tool will extract the person's age.
        ### Supported Formats:
        - Direct age mention (e.g., "age: 25", "30 years old")
        - Date of birth (e.g., "DOB: 01-01-1990", "Born on: 01/01/1990")
        """
    )
    with gr.Row():
        with gr.Column():
            # type="binary" makes Gradio pass the upload to the handler as bytes.
            pdf_input = gr.File(
                label="Upload PDF Document",
                file_types=[".pdf"],
                type="binary"
            )
            submit_btn = gr.Button("Extract Age", variant="primary")
        with gr.Column():
            with gr.Group():
                error_output = gr.Textbox(label="Status", interactive=False)
                age_output = gr.Number(label="Extracted Age", interactive=False)
                confidence_output = gr.Textbox(label="Confidence", interactive=False)
                method_output = gr.Textbox(label="Extraction Method", interactive=False)
    # Handle file upload and processing. Outputs must be the actual display
    # components, in the same order as the values the handler returns.
    # Wrapping them in a new gr.JSON component leaves them never updated.
    submit_btn.click(
        fn=_process_pdf_for_ui,
        inputs=[pdf_input],
        outputs=[
            error_output,
            age_output,
            confidence_output,
            method_output
        ]
    )
    gr.Markdown(
        """
        ### Notes:
        - The tool works best with clearly formatted documents
        - Supports both text-based PDFs and PDFs containing images
        - Higher confidence scores indicate more reliable extractions
        """
    )
if __name__ == "__main__":
    app.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,60 @@

+aiofiles==23.2.1
+annotated-types==0.7.0
+anyio==4.8.0
+certifi==2024.12.14
+cffi==1.17.1
+charset-normalizer==3.4.1
+click==8.1.8
+cryptography==44.0.0
+exceptiongroup==1.2.2
+fastapi==0.115.6
+ffmpy==0.5.0
+filelock==3.16.1
+fsspec==2024.12.0
+gradio==5.12.0
+gradio_client==1.5.4
+h11==0.14.0
+httpcore==1.0.7
+httpx==0.28.1
+huggingface-hub==0.27.1
+idna==3.10
+Jinja2==3.1.5
+markdown-it-py==3.0.0
+MarkupSafe==2.1.5
+mdurl==0.1.2
+numpy==2.2.1
+opencv-python==4.11.0.86
+orjson==3.10.14
+packaging==24.2
+pandas==2.2.3
+pdfminer.six==20231228
+pdfplumber==0.11.5
+pillow==11.1.0
+pycparser==2.22
+pydantic==2.10.5
+pydantic_core==2.27.2
+pydub==0.25.1
+Pygments==2.19.1
+pypdfium2==4.30.1
+pytesseract==0.3.13
+python-dateutil==2.9.0.post0
+python-multipart==0.0.20
+pytz==2024.2
+PyYAML==6.0.2
+requests==2.32.3
+rich==13.9.4
+ruff==0.9.2
+safehttpx==0.1.6
+semantic-version==2.10.0
+shellingham==1.5.4
+six==1.17.0
+sniffio==1.3.1
+starlette==0.41.3
+tomlkit==0.13.2
+tqdm==4.67.1
+typer==0.15.1
+typing_extensions==4.12.2
+tzdata==2024.2
+urllib3==2.3.0
+uvicorn==0.34.0
+websockets==14.1