Rivalcoder committed on
Commit
4b022af
·
1 Parent(s): a9ff187
Dockerfile ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Use the Python 3.11 slim image as base.
# Pinned to Debian bookworm: the floating "slim" tag follows the newest Debian
# release, where the transitional package libgl1-mesa-glx may no longer exist.
FROM python:3.11-slim-bookworm

# Set working directory
WORKDIR /app

# Install system dependencies required for PDF processing and OCR
RUN apt-get update && apt-get install -y \
    tesseract-ocr \
    tesseract-ocr-eng \
    tesseract-ocr-hin \
    tesseract-ocr-ben \
    tesseract-ocr-tam \
    tesseract-ocr-tel \
    tesseract-ocr-guj \
    tesseract-ocr-mar \
    tesseract-ocr-pan \
    tesseract-ocr-ori \
    tesseract-ocr-asm \
    tesseract-ocr-mal \
    tesseract-ocr-kan \
    libgl1-mesa-glx \
    libglib2.0-0 \
    libsm6 \
    libxext6 \
    libxrender-dev \
    libgomp1 \
    libgcc-s1 \
    poppler-utils \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first for better Docker layer caching
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY app/ ./app/

# Create a non-root user for security
RUN useradd --create-home --shell /bin/bash app \
    && chown -R app:app /app
USER app

# Expose port 8000
EXPOSE 8000

# Set environment variables
ENV PYTHONPATH=/app
ENV PYTHONUNBUFFERED=1

# Health check.
# Uses the Python interpreter that is already in the image: curl is not part of
# the slim base image and is not installed above, so a curl-based probe would
# always fail and mark the container unhealthy. urlopen raises (non-zero exit)
# on connection errors and on HTTP status >= 400.
HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health', timeout=5)" || exit 1

# Run the application
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
app/__init__.py ADDED
File without changes
app/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (167 Bytes). View file
 
app/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (142 Bytes). View file
 
app/__pycache__/config.cpython-311.pyc ADDED
Binary file (843 Bytes). View file
 
app/__pycache__/config.cpython-312.pyc ADDED
Binary file (704 Bytes). View file
 
app/__pycache__/main.cpython-311.pyc ADDED
Binary file (6.65 kB). View file
 
app/__pycache__/main.cpython-312.pyc ADDED
Binary file (5.72 kB). View file
 
app/__pycache__/schemas.cpython-311.pyc ADDED
Binary file (1.49 kB). View file
 
app/__pycache__/schemas.cpython-312.pyc ADDED
Binary file (1.19 kB). View file
 
app/config.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dotenv import load_dotenv
3
+
4
+ # Load environment variables from .env file (if it exists)
5
+ load_dotenv()
6
+
7
+ # Get the Gemini API key from environment variables
8
+ GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
9
+
10
+ # Settings object for easy access
11
class Settings:
    """Process-wide configuration values read from the environment at import time."""

    # Gemini API key as captured by the module-level os.getenv lookup
    # (None when the variable is not set).
    GEMINI_API_KEY: str = GEMINI_API_KEY
    # Logging level name; defaults to "INFO" when LOG_LEVEL is not set.
    LOG_LEVEL: str = os.getenv("LOG_LEVEL", "INFO")
    # Upload size limit in bytes; defaults to 10485760 (10 MB).
    MAX_FILE_SIZE: int = int(os.getenv("MAX_FILE_SIZE", "10485760"))

    def validate(self):
        """Check that the required settings are present.

        Raises:
            ValueError: if GEMINI_API_KEY is missing or empty.
        """
        if self.GEMINI_API_KEY:
            return

        message = (
            "GEMINI_API_KEY not found in environment variables. "
            "Please set it in your environment or .env file."
        )
        raise ValueError(message)
23
+
24
+ settings = Settings()
25
+
26
+ # Validate settings on import (only if GEMINI_API_KEY is set)
27
+ if GEMINI_API_KEY:
28
+ settings.validate()
app/data/Basic-NDA.pdf ADDED
Binary file (75.3 kB). View file
 
app/data/Basic-Non-Disclosure-Agreement.pdf ADDED
Binary file (71.7 kB). View file
 
app/main.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, File, UploadFile, HTTPException
2
+ from fastapi.responses import JSONResponse
3
+ import tempfile
4
+ import os
5
+ from typing import List
6
+
7
+ # Import our services
8
+ from app.services.text_extractor import extract_text_from_pdf
9
+ from app.services.preprocessor import segment_into_clauses
10
+ from app.services.risk_analyzer import analyze_clause_with_gemini
11
+ from app.services.risk_scorer import calculate_scores, get_risk_definition
12
+
13
+ from app.schemas import AnalysisReport, AnalyzedClause, RiskFinding
14
+
15
+ # Create FastAPI app instance
16
+ app = FastAPI(
17
+ title="Multilingual Legal Contract Analyzer",
18
+ description="AI-powered contract analysis for English and Indic languages",
19
+ version="1.0.0"
20
+ )
21
+
22
+
23
def _build_analyzed_clause(clause_number: int, clause_text: str) -> AnalyzedClause:
    """Run the AI risk analysis for one clause and wrap the result in the response schema."""
    ai_result = analyze_clause_with_gemini(clause_text)

    # Convert AI results to RiskFinding objects
    risks = []
    for risk_data in ai_result.get("risks", []):
        risk_id = risk_data.get("risk_id")
        if not risk_id:
            continue
        risk_def = get_risk_definition(risk_id)
        risks.append(
            RiskFinding(
                risk_id=risk_id,
                # `or` instead of a .get() default: an explicit null/empty
                # explanation from the model must also fall back to the
                # catalogue description, otherwise schema validation fails.
                description=risk_data.get("explanation") or risk_def["description"],
                score=risk_def["score"],
            )
        )

    # Truncate long clauses so the response stays compact
    if len(clause_text) > 500:
        display_text = clause_text[:500] + "..."
    else:
        display_text = clause_text

    return AnalyzedClause(
        clause_number=clause_number,
        text=display_text,
        risks=risks,
        suggestion=ai_result.get("suggestion"),
    )


@app.post("/analyze/", response_model=AnalysisReport)
async def analyze_contract(file: UploadFile = File(...)):
    """
    Analyze a legal contract PDF and return detailed risk analysis.

    Args:
        file: PDF file to analyze

    Returns:
        AnalysisReport with risk analysis and suggestions

    Raises:
        HTTPException: 400 if the upload is not a PDF, no meaningful text can
            be extracted, or no clauses can be identified; 500 for any other
            failure during analysis.
    """

    # Validate file type (the client may omit the filename entirely)
    filename = file.filename or ""
    if not filename.lower().endswith('.pdf'):
        raise HTTPException(
            status_code=400, detail="Only PDF files are supported")

    temp_path = None
    try:
        # Write the upload to a temporary file and close it before parsing,
        # so the extractor reads fully flushed data from a handle we no
        # longer hold open.
        content = await file.read()
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
            temp_path = temp_file.name
            temp_file.write(content)

        # Step 1: Extract text from PDF
        print(f"Extracting text from {filename}...")
        full_text = extract_text_from_pdf(temp_path)

        if not full_text or len(full_text.strip()) < 50:
            raise HTTPException(
                status_code=400,
                detail="Unable to extract meaningful text from PDF. Please ensure the PDF is readable."
            )

        # Step 2: Segment text into clauses
        print("Segmenting text into clauses...")
        clauses = segment_into_clauses(full_text)

        if not clauses:
            raise HTTPException(
                status_code=400,
                detail="Unable to identify contract clauses. Please ensure the document is a valid contract."
            )

        # Step 3: Analyze each clause with Gemini AI
        print(f"Analyzing {len(clauses)} clauses with AI...")
        analyzed_clauses = []
        for i, clause_text in enumerate(clauses, 1):
            print(f"Analyzing clause {i}/{len(clauses)}...")
            analyzed_clauses.append(_build_analyzed_clause(i, clause_text))

        # Step 4: Calculate final risk score (the flat findings list is not
        # part of the response, so it is discarded)
        print("Calculating final risk score...")
        final_score, _ = calculate_scores(analyzed_clauses)

        # Step 5: Determine contract type and language (basic detection)
        contract_type = "General Contract"  # Could be enhanced with AI detection
        language = "English"  # Could be enhanced with language detection

        # Create final analysis report
        analysis_report = AnalysisReport(
            file_name=filename,
            language=language,
            contract_type=contract_type,
            final_risk_score=final_score,
            clauses=analyzed_clauses
        )

        print(f"Analysis complete. Final risk score: {final_score}")
        return analysis_report

    except HTTPException:
        # Deliberate client-facing errors pass through untouched
        raise
    except Exception as e:
        print(f"Error during analysis: {e}")
        raise HTTPException(
            status_code=500,
            detail=f"Analysis failed: {str(e)}"
        )
    finally:
        # Best-effort cleanup of the temporary file; only filesystem errors
        # are ignored so real bugs are not masked.
        if temp_path is not None:
            try:
                os.unlink(temp_path)
            except OSError:
                pass
137
+
138
+
139
@app.get("/")
async def root():
    """Landing endpoint that confirms the API process is serving requests."""
    banner = "Multilingual Legal Contract Analyzer API is running"
    return {"message": banner}
143
+
144
+
145
@app.get("/health")
async def health_check():
    """Liveness probe: returns a static status payload."""
    payload = {}
    payload["status"] = "healthy"
    payload["service"] = "contract-analyzer"
    return payload
149
+
150
+
151
+ if __name__ == "__main__":
152
+ import uvicorn
153
+ uvicorn.run(app, host="0.0.0.0", port=8000)
app/schemas.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel
2
+ from typing import List, Optional
3
+
4
+
5
class RiskFinding(BaseModel):
    """One risk detected in a contract clause."""

    # Identifier of the risk category
    risk_id: str
    # Human-readable explanation of the risk
    description: str
    # Numeric weight of this risk
    score: int
9
+
10
+
11
class AnalyzedClause(BaseModel):
    """A single contract clause together with the outcome of its risk analysis."""

    # Position of the clause within the document
    clause_number: int
    # Clause text
    text: str
    # Risks identified in this clause (may be empty)
    risks: List[RiskFinding]
    # Optional remediation advice; defaults to None when absent
    suggestion: Optional[str] = None
16
+
17
+
18
class AnalysisReport(BaseModel):
    """Top-level result describing the analysis of one contract file."""

    # Name of the analysed file
    file_name: str
    # Language label of the document
    language: str
    # Contract type label
    contract_type: str
    # Overall risk score for the document
    final_risk_score: int
    # Per-clause analysis results
    clauses: List[AnalyzedClause]
app/services/__init__.py ADDED
File without changes
app/services/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (176 Bytes). View file
 
app/services/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (151 Bytes). View file
 
app/services/__pycache__/preprocessor.cpython-311.pyc ADDED
Binary file (1.81 kB). View file
 
app/services/__pycache__/preprocessor.cpython-312.pyc ADDED
Binary file (1.46 kB). View file
 
app/services/__pycache__/risk_analyzer.cpython-311.pyc ADDED
Binary file (4.07 kB). View file
 
app/services/__pycache__/risk_analyzer.cpython-312.pyc ADDED
Binary file (3.82 kB). View file
 
app/services/__pycache__/risk_scorer.cpython-311.pyc ADDED
Binary file (1.99 kB). View file
 
app/services/__pycache__/risk_scorer.cpython-312.pyc ADDED
Binary file (1.84 kB). View file
 
app/services/__pycache__/text_extractor.cpython-311.pyc ADDED
Binary file (2.31 kB). View file
 
app/services/__pycache__/text_extractor.cpython-312.pyc ADDED
Binary file (2.03 kB). View file
 
app/services/preprocessor.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from typing import List
3
+
4
+
5
def segment_into_clauses(full_text: str) -> List[str]:
    """
    Segment the full document text into individual clauses using regex patterns.
    Looks for common clause patterns like "1.", "1.1", "(a)", etc.

    Args:
        full_text: Raw text of the whole document.

    Returns:
        Clause strings, stripped of surrounding whitespace. Only fragments
        longer than 50 characters are kept. Returns an empty list for empty
        input.
    """
    if not full_text:
        return []

    # Define regex patterns for different clause formats
    clause_patterns = [
        r'\n\s*\d+\.\s+',            # "1. ", "2. ", etc.
        r'\n\s*\d+\.\d+\s+',         # "1.1 ", "1.2 ", etc.
        r'\n\s*\(\w+\)\s+',          # "(a) ", "(b) ", etc.
        r'\n\s*[ivx]+\.\s+',         # "i. ", "ii. ", "iii. ", etc.
        r'\n\s*[IVX]+\.\s+',         # "I. ", "II. ", "III. ", etc.
        r'\n\s*Article\s+\d+\s*:',   # "Article 1:", "Article 2:", etc.
        r'\n\s*Section\s+\d+\s*:',   # "Section 1:", "Section 2:", etc.
        r'\n\s*Clause\s+\d+\s*:',    # "Clause 1:", "Clause 2:", etc.
    ]

    # Combine all patterns with OR operator
    combined_pattern = '|'.join(clause_patterns)

    def _substantial(fragments):
        """Strip each fragment and keep only those longer than 50 characters."""
        stripped = (fragment.strip() for fragment in fragments)
        return [fragment for fragment in stripped if len(fragment) > 50]

    # Split text on the clause markers and drop trivial fragments
    cleaned_clauses = _substantial(re.split(combined_pattern, full_text))

    # If no clauses were found with the patterns, try a simpler approach
    if len(cleaned_clauses) <= 1:
        # Split on blank lines, or on a line break that follows a period and
        # precedes a capital letter. Lookarounds keep both the period and the
        # capital letter in the output; matching them directly would consume
        # them and chop the first letter off every following clause.
        simple_clauses = re.split(
            r'\n\s*\n|(?<=\.)\s*\n\s*(?=[A-Z])', full_text)
        cleaned_clauses = _substantial(simple_clauses)

    return cleaned_clauses
app/services/risk_analyzer.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import google.generativeai as genai
2
+ from app.config import settings
3
+ from app.services.risk_scorer import RISK_DEFINITIONS
4
+ import json
5
+ import re
6
+ from typing import Dict, List
7
+
8
+
9
+ # Configure the Gemini API
10
+ genai.configure(api_key=settings.GEMINI_API_KEY)
11
+
12
+
13
def _build_clause_prompt(clause_text: str) -> str:
    """Return the instruction prompt sent to Gemini for a single clause."""
    return f"""
You are an expert Indian legal consultant specializing in contract analysis and risk assessment.
Analyze the following contract clause and identify any legal risks based on the predefined risk categories.

CONTRACT CLAUSE TO ANALYZE:
{clause_text}

RISK CATEGORIES TO CHECK FOR:
1. UNLIMITED_LIABILITY: Clause imposes unlimited liability on the client
2. ONE_SIDED_TERMINATION: Termination rights are unfairly one-sided
3. UNCLEAR_JURISDICTION: Governing law or jurisdiction for disputes is ambiguous
4. DPDP_NON_COMPLIANCE: Data protection clause may not comply with the DPDP Act 2023

INSTRUCTIONS:
1. Carefully read the clause text
2. Identify which of the above risk categories apply to this clause
3. For each identified risk, provide a brief explanation
4. Suggest a compliant alternative or modification for any identified risks
5. If no risks are found, respond with "No risks identified"

RESPONSE FORMAT (JSON):
{{
"risks": [
{{
"risk_id": "RISK_CATEGORY_ID",
"explanation": "Brief explanation of why this risk applies"
}}
],
"suggestion": "Compliant alternative or modification suggestion"
}}

If no risks are identified, return:
{{
"risks": [],
"suggestion": "No risks identified - clause appears compliant"
}}
"""


def _parse_gemini_json(response_text: str) -> Dict:
    """Extract the JSON object from a model reply, or return the manual-review fallback."""
    fallback = {
        "risks": [],
        "suggestion": "Unable to parse AI response - manual review recommended"
    }

    # Look for JSON in the response (sometimes Gemini includes extra text);
    # if no braces are found, try the entire response as JSON.
    json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
    candidate = json_match.group() if json_match else response_text

    try:
        parsed = json.loads(candidate)
    except json.JSONDecodeError:
        return fallback

    # Valid JSON that is not an object (list, string, number) is unusable here
    return parsed if isinstance(parsed, dict) else fallback


def _normalize_analysis(result: Dict) -> Dict:
    """Coerce a parsed reply into the {"risks": list, "suggestion": str} shape callers rely on."""
    # Keep only well-formed risk entries whose ID is in the known catalogue
    raw_risks = result.get("risks")
    if not isinstance(raw_risks, list):
        raw_risks = []
    result["risks"] = [
        risk for risk in raw_risks
        if isinstance(risk, dict)
        and isinstance(risk.get("risk_id"), str)
        and risk["risk_id"] in RISK_DEFINITIONS
    ]

    # A missing or non-string suggestion would fail response-schema validation
    if not isinstance(result.get("suggestion"), str):
        result["suggestion"] = "No suggestion provided"

    return result


def analyze_clause_with_gemini(clause_text: str) -> Dict:
    """
    Analyze a contract clause using Google Gemini AI for risk identification.

    Args:
        clause_text: The text of the clause to analyze

    Returns:
        Dictionary with a "risks" list (entries carrying a known "risk_id")
        and a "suggestion" string. Never raises: on any failure the risks
        list is empty and the suggestion describes the failure.
    """
    prompt = _build_clause_prompt(clause_text)

    try:
        # Initialize the Gemini model
        model = genai.GenerativeModel('gemini-2.5-flash-lite')

        # Generate response and extract its text
        response = model.generate_content(prompt)
        response_text = response.text.strip()

        return _normalize_analysis(_parse_gemini_json(response_text))

    except Exception as e:
        # Boundary handler: one failing clause must not abort the whole document
        print(f"Error in Gemini analysis: {e}")
        return {
            "risks": [],
            "suggestion": f"Analysis failed: {str(e)}"
        }
app/services/risk_scorer.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Tuple
2
+ from app.schemas import AnalyzedClause, RiskFinding
3
+
4
+
5
+ # Risk definitions as specified in the requirements
6
+ RISK_DEFINITIONS = {
7
+ "UNLIMITED_LIABILITY": {"score": 10, "description": "Clause imposes unlimited liability on the client."},
8
+ "ONE_SIDED_TERMINATION": {"score": 8, "description": "Termination rights are unfairly one-sided."},
9
+ "UNCLEAR_JURISDICTION": {"score": 6, "description": "Governing law or jurisdiction for disputes is ambiguous."},
10
+ "DPDP_NON_COMPLIANCE": {"score": 7, "description": "Data protection clause may not comply with the DPDP Act 2023."}
11
+ }
12
+
13
+
14
def calculate_scores(analyzed_clauses: List[AnalyzedClause]) -> Tuple[int, List[RiskFinding]]:
    """
    Aggregate the risks of all clauses into one score and one flat list.

    Args:
        analyzed_clauses: Clauses whose ``risks`` have already been populated

    Returns:
        Tuple of (sum of every risk's score, all risks in clause order)
    """
    # Flatten clause -> risks while preserving document order
    findings = [
        finding
        for clause in analyzed_clauses
        for finding in clause.risks
    ]

    combined_score = sum(finding.score for finding in findings)

    return combined_score, findings
37
+
38
+
39
def get_risk_definition(risk_id: str) -> dict:
    """
    Look up the catalogue entry for a risk identifier.

    Args:
        risk_id: The risk identifier

    Returns:
        The matching RISK_DEFINITIONS entry (score and description), or a
        zero-score "Unknown risk" placeholder when the ID is not catalogued
    """
    if risk_id in RISK_DEFINITIONS:
        return RISK_DEFINITIONS[risk_id]

    return {"score": 0, "description": "Unknown risk"}
app/services/text_extractor.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fitz # PyMuPDF
2
+ import easyocr
3
+ from pdf2image import convert_from_path
4
+ from typing import Optional
5
+ import tempfile
6
+ import os
7
+
8
+
9
def extract_text_from_pdf(file_path: str) -> str:
    """
    Extract text from PDF using hybrid approach:
    1. First try PyMuPDF for searchable PDFs
    2. If minimal text, fall back to OCR for scanned PDFs

    Args:
        file_path: Path of the PDF file on disk.

    Returns:
        The extracted text, stripped of surrounding whitespace. May be empty
        when OCR finds no text above the confidence threshold.

    Raises:
        Exception: if the OCR fallback itself fails; the underlying error is
            chained as the cause.
    """
    # Local import: numpy is needed only by the OCR fallback below.
    import numpy as np

    # Step 1: Try PyMuPDF extraction
    try:
        # Context manager guarantees the document is closed even if a page
        # fails to load or render text.
        with fitz.open(file_path) as doc:
            text = "".join(page.get_text() for page in doc)

        # Check if we got meaningful text (more than 100 characters)
        if len(text.strip()) > 100:
            return text.strip()

    except Exception as e:
        # Deliberate best-effort: any failure here just triggers the OCR path
        print(f"PyMuPDF extraction failed: {e}")

    # Step 2: Fall back to OCR for scanned PDFs
    try:
        # Convert PDF to images (one PIL image per page)
        images = convert_from_path(file_path)

        # Initialize EasyOCR for English and Hindi
        reader = easyocr.Reader(['en', 'hi'])

        fragments = []
        for image in images:
            # readtext is documented for file paths, bytes and numpy arrays;
            # convert the PIL page image to an array before passing it in.
            results = reader.readtext(np.array(image))

            # Keep only high-confidence text
            for _bbox, fragment, confidence in results:
                if confidence > 0.5:
                    fragments.append(fragment)

        return " ".join(fragments).strip()

    except Exception as e:
        print(f"OCR extraction failed: {e}")
        raise Exception(f"Failed to extract text from PDF: {e}") from e
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi==0.104.1
2
+ uvicorn[standard]==0.24.0
3
+ pydantic==2.5.0
4
+ python-dotenv==1.0.0
5
+ PyMuPDF==1.23.8
6
+ pdf2image==1.16.3
7
+ easyocr==1.7.0
8
+ google-generativeai==0.3.2
9
+ python-multipart==0.0.6
10
+ Pillow==10.1.0