Rivalcoder
committed on
Commit
·
4b022af
1
Parent(s):
a9ff187
Add Files
Browse files- Dockerfile +58 -0
- app/__init__.py +0 -0
- app/__pycache__/__init__.cpython-311.pyc +0 -0
- app/__pycache__/__init__.cpython-312.pyc +0 -0
- app/__pycache__/config.cpython-311.pyc +0 -0
- app/__pycache__/config.cpython-312.pyc +0 -0
- app/__pycache__/main.cpython-311.pyc +0 -0
- app/__pycache__/main.cpython-312.pyc +0 -0
- app/__pycache__/schemas.cpython-311.pyc +0 -0
- app/__pycache__/schemas.cpython-312.pyc +0 -0
- app/config.py +28 -0
- app/data/Basic-NDA.pdf +0 -0
- app/data/Basic-Non-Disclosure-Agreement.pdf +0 -0
- app/main.py +153 -0
- app/schemas.py +23 -0
- app/services/__init__.py +0 -0
- app/services/__pycache__/__init__.cpython-311.pyc +0 -0
- app/services/__pycache__/__init__.cpython-312.pyc +0 -0
- app/services/__pycache__/preprocessor.cpython-311.pyc +0 -0
- app/services/__pycache__/preprocessor.cpython-312.pyc +0 -0
- app/services/__pycache__/risk_analyzer.cpython-311.pyc +0 -0
- app/services/__pycache__/risk_analyzer.cpython-312.pyc +0 -0
- app/services/__pycache__/risk_scorer.cpython-311.pyc +0 -0
- app/services/__pycache__/risk_scorer.cpython-312.pyc +0 -0
- app/services/__pycache__/text_extractor.cpython-311.pyc +0 -0
- app/services/__pycache__/text_extractor.cpython-312.pyc +0 -0
- app/services/preprocessor.py +43 -0
- app/services/risk_analyzer.py +113 -0
- app/services/risk_scorer.py +49 -0
- app/services/text_extractor.py +58 -0
- requirements.txt +10 -0
Dockerfile
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Use Python 3.11 slim image as base
FROM python:3.11-slim

# Set working directory
WORKDIR /app

# Install system dependencies required for PDF processing and OCR.
# - libgl1 replaces libgl1-mesa-glx: the latter is only a transitional
#   package and is not available on newer Debian releases, which the
#   floating python:3.11-slim tag follows.
# - curl is needed by the HEALTHCHECK below; the slim image does not ship it,
#   so without it the health check fails on every probe.
RUN apt-get update && apt-get install -y \
    tesseract-ocr \
    tesseract-ocr-eng \
    tesseract-ocr-hin \
    tesseract-ocr-ben \
    tesseract-ocr-tam \
    tesseract-ocr-tel \
    tesseract-ocr-guj \
    tesseract-ocr-mar \
    tesseract-ocr-pan \
    tesseract-ocr-ori \
    tesseract-ocr-asm \
    tesseract-ocr-mal \
    tesseract-ocr-kan \
    libgl1 \
    libglib2.0-0 \
    libsm6 \
    libxext6 \
    libxrender-dev \
    libgomp1 \
    libgcc-s1 \
    poppler-utils \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first for better Docker layer caching
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY app/ ./app/

# Create a non-root user for security
RUN useradd --create-home --shell /bin/bash app \
    && chown -R app:app /app
USER app

# Expose port 8000
EXPOSE 8000

# Set environment variables
ENV PYTHONPATH=/app
ENV PYTHONUNBUFFERED=1

# Health check (relies on curl installed above)
HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:8000/health || exit 1

# Run the application
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
|
app/__init__.py
ADDED
|
File without changes
|
app/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (167 Bytes). View file
|
|
|
app/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (142 Bytes). View file
|
|
|
app/__pycache__/config.cpython-311.pyc
ADDED
|
Binary file (843 Bytes). View file
|
|
|
app/__pycache__/config.cpython-312.pyc
ADDED
|
Binary file (704 Bytes). View file
|
|
|
app/__pycache__/main.cpython-311.pyc
ADDED
|
Binary file (6.65 kB). View file
|
|
|
app/__pycache__/main.cpython-312.pyc
ADDED
|
Binary file (5.72 kB). View file
|
|
|
app/__pycache__/schemas.cpython-311.pyc
ADDED
|
Binary file (1.49 kB). View file
|
|
|
app/__pycache__/schemas.cpython-312.pyc
ADDED
|
Binary file (1.19 kB). View file
|
|
|
app/config.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from dotenv import load_dotenv
|
| 3 |
+
|
| 4 |
+
# Load environment variables from a .env file, if one exists, so the
# os.getenv() lookups in this module can see them.
load_dotenv()

# Gemini API key read from the environment; os.getenv returns None when the
# variable is not set.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
|
| 9 |
+
|
| 10 |
+
# Settings object for easy access
|
| 11 |
+
class Settings:
    """Application configuration resolved from environment variables."""

    # Gemini API key captured at import time; None when the variable is unset.
    GEMINI_API_KEY: str = GEMINI_API_KEY
    # Log level name; "INFO" unless LOG_LEVEL overrides it.
    LOG_LEVEL: str = os.getenv("LOG_LEVEL", "INFO")
    # Maximum file size in bytes; MAX_FILE_SIZE overrides the default.
    MAX_FILE_SIZE: int = int(os.getenv("MAX_FILE_SIZE", "10485760"))  # 10MB default

    def validate(self):
        """Raise ValueError when no Gemini API key is configured."""
        if self.GEMINI_API_KEY:
            return
        message = (
            "GEMINI_API_KEY not found in environment variables. "
            "Please set it in your environment or .env file."
        )
        raise ValueError(message)
|
| 23 |
+
|
| 24 |
+
# Module-level settings instance shared by importers of this module.
settings = Settings()

# NOTE(review): validate() only raises when GEMINI_API_KEY is missing, and
# this call is skipped in exactly that case, so it can never fail here.
# Importing this module therefore succeeds without a key; call
# settings.validate() explicitly wherever a missing key must be fatal.
if GEMINI_API_KEY:
    settings.validate()
|
app/data/Basic-NDA.pdf
ADDED
|
Binary file (75.3 kB). View file
|
|
|
app/data/Basic-Non-Disclosure-Agreement.pdf
ADDED
|
Binary file (71.7 kB). View file
|
|
|
app/main.py
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import FastAPI, File, UploadFile, HTTPException
|
| 2 |
+
from fastapi.responses import JSONResponse
|
| 3 |
+
import tempfile
|
| 4 |
+
import os
|
| 5 |
+
from typing import List
|
| 6 |
+
|
| 7 |
+
# Import our services
|
| 8 |
+
from app.services.text_extractor import extract_text_from_pdf
|
| 9 |
+
from app.services.preprocessor import segment_into_clauses
|
| 10 |
+
from app.services.risk_analyzer import analyze_clause_with_gemini
|
| 11 |
+
from app.services.risk_scorer import calculate_scores, get_risk_definition
|
| 12 |
+
|
| 13 |
+
from app.schemas import AnalysisReport, AnalyzedClause, RiskFinding
|
| 14 |
+
|
| 15 |
+
# Create FastAPI app instance
|
| 16 |
+
# ASGI application object; the route decorators below register on it and the
# container starts it through uvicorn as "app.main:app".
app = FastAPI(
    title="Multilingual Legal Contract Analyzer",
    description="AI-powered contract analysis for English and Indic languages",
    version="1.0.0"
)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
@app.post("/analyze/", response_model=AnalysisReport)
async def analyze_contract(file: UploadFile = File(...)):
    """
    Analyze a legal contract PDF and return detailed risk analysis.

    Args:
        file: PDF file to analyze

    Returns:
        AnalysisReport with per-clause risks, suggestions and the summed score

    Raises:
        HTTPException: 400 when the upload is not named *.pdf, when fewer than
            50 characters of text can be extracted, or when no clauses are
            found; 500 for any other failure during analysis.
    """
    # Imported here so the block does not depend on a file-level import change.
    from fastapi.concurrency import run_in_threadpool

    # The filename of an upload is optional; treat a missing name as "not a
    # PDF" instead of failing with AttributeError on None.
    file_name = file.filename or ""
    if not file_name.lower().endswith('.pdf'):
        raise HTTPException(
            status_code=400, detail="Only PDF files are supported")

    temp_path = None
    try:
        # Persist the upload so the extractor can open it by path. The handle
        # is closed (leaving the `with`) before the path is read or deleted,
        # which also works on platforms that forbid reopening an open file.
        content = await file.read()
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
            temp_path = temp_file.name
            temp_file.write(content)

        # Step 1: Extract text from PDF. The extractor is synchronous and may
        # run OCR, so it is moved off the event loop to keep other endpoints
        # (e.g. /health) responsive.
        print(f"Extracting text from {file_name}...")
        full_text = await run_in_threadpool(extract_text_from_pdf, temp_path)

        if not full_text or len(full_text.strip()) < 50:
            raise HTTPException(
                status_code=400,
                detail="Unable to extract meaningful text from PDF. Please ensure the PDF is readable."
            )

        # Step 2: Segment text into clauses
        print("Segmenting text into clauses...")
        clauses = segment_into_clauses(full_text)

        if not clauses:
            raise HTTPException(
                status_code=400,
                detail="Unable to identify contract clauses. Please ensure the document is a valid contract."
            )

        # Step 3: Analyze each clause with Gemini AI
        print(f"Analyzing {len(clauses)} clauses with AI...")
        analyzed_clauses = []

        for i, clause_text in enumerate(clauses, 1):
            print(f"Analyzing clause {i}/{len(clauses)}...")

            # Blocking network call; run it in the thread pool as well.
            ai_result = await run_in_threadpool(
                analyze_clause_with_gemini, clause_text)

            # Convert AI results to RiskFinding objects
            risks = []
            for risk_data in ai_result.get("risks", []):
                risk_id = risk_data.get("risk_id")
                if not risk_id:
                    continue
                risk_def = get_risk_definition(risk_id)
                # Fall back to the catalogue description when the model gave
                # no usable explanation (missing, null, blank or non-string);
                # RiskFinding.description must be a string.
                explanation = risk_data.get("explanation")
                if not isinstance(explanation, str) or not explanation.strip():
                    explanation = risk_def["description"]
                risks.append(RiskFinding(
                    risk_id=risk_id,
                    description=explanation,
                    score=risk_def["score"]
                ))

            # Truncate long clauses for the response
            if len(clause_text) > 500:
                preview = clause_text[:500] + "..."
            else:
                preview = clause_text

            analyzed_clauses.append(AnalyzedClause(
                clause_number=i,
                text=preview,
                risks=risks,
                suggestion=ai_result.get("suggestion")
            ))

        # Step 4: Calculate final risk score (the flat findings list that is
        # also returned is not needed for the report)
        print("Calculating final risk score...")
        final_score, _ = calculate_scores(analyzed_clauses)

        # Step 5: Determine contract type and language (basic detection)
        contract_type = "General Contract"  # Could be enhanced with AI detection
        language = "English"  # Could be enhanced with language detection

        # Create final analysis report
        analysis_report = AnalysisReport(
            file_name=file_name,
            language=language,
            contract_type=contract_type,
            final_risk_score=final_score,
            clauses=analyzed_clauses
        )

        print(f"Analysis complete. Final risk score: {final_score}")
        return analysis_report

    except HTTPException:
        # Deliberate client-facing errors pass through unchanged.
        raise
    except Exception as e:
        print(f"Error during analysis: {e}")
        raise HTTPException(
            status_code=500,
            detail=f"Analysis failed: {str(e)}"
        ) from e
    finally:
        # Clean up the temporary file; a failed delete is reported but must
        # not mask the real outcome of the request.
        if temp_path is not None:
            try:
                os.unlink(temp_path)
            except OSError as cleanup_error:
                print(
                    f"Could not remove temporary file {temp_path}: {cleanup_error}")
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
@app.get("/")
async def root():
    """Return a short message confirming that the API is up."""
    banner = "Multilingual Legal Contract Analyzer API is running"
    return {"message": banner}
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
@app.get("/health")
async def health_check():
    """Return a static status payload for liveness probes."""
    payload = {}
    payload["status"] = "healthy"
    payload["service"] = "contract-analyzer"
    return payload
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
# Allow running this module directly; uvicorn is imported only in that case.
if __name__ == "__main__":
    import uvicorn
    # Listen on all interfaces, port 8000.
    uvicorn.run(app, host="0.0.0.0", port=8000)
|
app/schemas.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pydantic import BaseModel
|
| 2 |
+
from typing import List, Optional
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
class RiskFinding(BaseModel):
    """A single risk identified in a clause."""
    # Identifier of the risk category.
    risk_id: str
    # Human-readable explanation of the risk.
    description: str
    # Numeric weight of this risk.
    score: int
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class AnalyzedClause(BaseModel):
    """One contract clause together with the risks found in it."""
    # Position of the clause within the document.
    clause_number: int
    # Clause text.
    text: str
    # Risks identified for this clause; may be empty.
    risks: List[RiskFinding]
    # Optional suggested alternative or modification; defaults to None.
    suggestion: Optional[str] = None
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class AnalysisReport(BaseModel):
    """Top-level result describing the analysis of one document."""
    # Name of the analyzed file.
    file_name: str
    # Language label of the document.
    language: str
    # Contract type label.
    contract_type: str
    # Overall risk score for the document.
    final_risk_score: int
    # Per-clause analysis results.
    clauses: List[AnalyzedClause]
|
app/services/__init__.py
ADDED
|
File without changes
|
app/services/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (176 Bytes). View file
|
|
|
app/services/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (151 Bytes). View file
|
|
|
app/services/__pycache__/preprocessor.cpython-311.pyc
ADDED
|
Binary file (1.81 kB). View file
|
|
|
app/services/__pycache__/preprocessor.cpython-312.pyc
ADDED
|
Binary file (1.46 kB). View file
|
|
|
app/services/__pycache__/risk_analyzer.cpython-311.pyc
ADDED
|
Binary file (4.07 kB). View file
|
|
|
app/services/__pycache__/risk_analyzer.cpython-312.pyc
ADDED
|
Binary file (3.82 kB). View file
|
|
|
app/services/__pycache__/risk_scorer.cpython-311.pyc
ADDED
|
Binary file (1.99 kB). View file
|
|
|
app/services/__pycache__/risk_scorer.cpython-312.pyc
ADDED
|
Binary file (1.84 kB). View file
|
|
|
app/services/__pycache__/text_extractor.cpython-311.pyc
ADDED
|
Binary file (2.31 kB). View file
|
|
|
app/services/__pycache__/text_extractor.cpython-312.pyc
ADDED
|
Binary file (2.03 kB). View file
|
|
|
app/services/preprocessor.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
from typing import List
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def segment_into_clauses(full_text: str, min_length: int = 50) -> List[str]:
    """
    Segment the full document text into individual clauses using regex patterns.

    The text is first split on line-leading clause markers such as "1.",
    "1.1", "(a)", "ii.", "IV.", "Article 3:", "Section 2:" or "Clause 5:".
    The markers themselves are removed. If that yields at most one clause,
    the text is split on blank lines and on sentence ends that are followed
    by a new line starting with a capital letter.

    Args:
        full_text: Complete text of the document.
        min_length: Fragments whose stripped length is not greater than this
            are discarded as noise. Defaults to 50.

    Returns:
        List of stripped clause texts, in document order. Empty when nothing
        substantial is found.
    """

    # Define regex patterns for different clause formats
    clause_patterns = [
        r'\n\s*\d+\.\s+',           # "1. ", "2. ", etc.
        r'\n\s*\d+\.\d+\s+',        # "1.1 ", "1.2 ", etc.
        r'\n\s*\(\w+\)\s+',         # "(a) ", "(b) ", etc.
        r'\n\s*[ivx]+\.\s+',        # "i. ", "ii. ", "iii. ", etc.
        r'\n\s*[IVX]+\.\s+',        # "I. ", "II. ", "III. ", etc.
        r'\n\s*Article\s+\d+\s*:',  # "Article 1:", "Article 2:", etc.
        r'\n\s*Section\s+\d+\s*:',  # "Section 1:", "Section 2:", etc.
        r'\n\s*Clause\s+\d+\s*:',   # "Clause 1:", "Clause 2:", etc.
    ]

    def substantial(fragments):
        """Strip each fragment and keep only those longer than min_length."""
        stripped = (fragment.strip() for fragment in fragments)
        return [fragment for fragment in stripped if len(fragment) > min_length]

    # Split text using all patterns combined with the OR operator
    combined_pattern = '|'.join(clause_patterns)
    cleaned_clauses = substantial(re.split(combined_pattern, full_text))

    # If no clauses were found with the patterns, try a simpler approach
    if len(cleaned_clauses) <= 1:
        # Split by blank lines, or at a line break that follows a period and
        # precedes a capital letter. Lookarounds keep the period and the
        # capital letter in the text; matching them directly would delete the
        # first character of every following clause.
        simple_clauses = re.split(
            r'\n\s*\n|(?<=\.)\s*\n\s*(?=[A-Z])', full_text)
        cleaned_clauses = substantial(simple_clauses)

    return cleaned_clauses
|
app/services/risk_analyzer.py
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import google.generativeai as genai
|
| 2 |
+
from app.config import settings
|
| 3 |
+
from app.services.risk_scorer import RISK_DEFINITIONS
|
| 4 |
+
import json
|
| 5 |
+
import re
|
| 6 |
+
from typing import Dict, List
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
# Configure the Gemini API once, at import time, with the key from settings.
# NOTE(review): settings.GEMINI_API_KEY comes from os.getenv and is None when
# the variable is unset; in that case configuration is not rejected here and
# the problem presumably only surfaces on the first request - confirm.
genai.configure(api_key=settings.GEMINI_API_KEY)
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def _build_clause_prompt(clause_text: str) -> str:
    """Build the Gemini prompt asking for a JSON risk assessment of one clause."""
    # NOTE(review): clause_text comes from an uploaded document and is placed
    # in the prompt verbatim, so text inside the contract can try to steer the
    # model. The risk IDs are validated afterwards, the suggestion is not.
    return f"""
    You are an expert Indian legal consultant specializing in contract analysis and risk assessment.
    Analyze the following contract clause and identify any legal risks based on the predefined risk categories.

    CONTRACT CLAUSE TO ANALYZE:
    {clause_text}

    RISK CATEGORIES TO CHECK FOR:
    1. UNLIMITED_LIABILITY: Clause imposes unlimited liability on the client
    2. ONE_SIDED_TERMINATION: Termination rights are unfairly one-sided
    3. UNCLEAR_JURISDICTION: Governing law or jurisdiction for disputes is ambiguous
    4. DPDP_NON_COMPLIANCE: Data protection clause may not comply with the DPDP Act 2023

    INSTRUCTIONS:
    1. Carefully read the clause text
    2. Identify which of the above risk categories apply to this clause
    3. For each identified risk, provide a brief explanation
    4. Suggest a compliant alternative or modification for any identified risks
    5. If no risks are found, return the JSON with an empty "risks" list shown at the end

    RESPONSE FORMAT (JSON):
    {{
        "risks": [
            {{
                "risk_id": "RISK_CATEGORY_ID",
                "explanation": "Brief explanation of why this risk applies"
            }}
        ],
        "suggestion": "Compliant alternative or modification suggestion"
    }}

    If no risks are identified, return:
    {{
        "risks": [],
        "suggestion": "No risks identified - clause appears compliant"
    }}
    """


def _parse_gemini_response(response_text: str) -> Dict:
    """Extract the JSON object from Gemini's reply, or return a safe fallback."""
    fallback = {
        "risks": [],
        "suggestion": "Unable to parse AI response - manual review recommended"
    }

    # Gemini sometimes wraps the JSON in extra text, so take the span from the
    # first "{" to the last "}" when there is one, else try the whole reply.
    json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
    candidate = json_match.group() if json_match else response_text

    try:
        parsed = json.loads(candidate)
    except json.JSONDecodeError:
        return fallback

    # Valid JSON that is not an object (a list, a string, ...) is unusable.
    if not isinstance(parsed, dict):
        return fallback
    return parsed


def analyze_clause_with_gemini(clause_text: str) -> Dict:
    """
    Analyze a contract clause using Google Gemini AI for risk identification.

    Args:
        clause_text: The text of the clause to analyze

    Returns:
        Dictionary with two keys: "risks", a list of dicts whose "risk_id" is
        a key of RISK_DEFINITIONS, and "suggestion". This function does not
        raise on API or parsing problems: it returns an empty "risks" list and
        a "suggestion" describing the failure instead.
    """

    # Create the detailed prompt for Gemini
    prompt = _build_clause_prompt(clause_text)

    try:
        # Initialize the Gemini model and generate the response
        model = genai.GenerativeModel('gemini-2.5-flash-lite')
        response = model.generate_content(prompt)

        result = _parse_gemini_response(response.text.strip())

        # Keep only well-formed risks with a known ID. Anything else (a
        # non-list "risks" value, non-dict entries, unknown or non-string IDs)
        # is dropped instead of failing the whole analysis.
        raw_risks = result.get("risks")
        if not isinstance(raw_risks, list):
            raw_risks = []
        result["risks"] = [
            risk for risk in raw_risks
            if isinstance(risk, dict)
            and isinstance(risk.get("risk_id"), str)
            and risk["risk_id"] in RISK_DEFINITIONS
        ]

        # Default a missing suggestion; coerce non-string values to text so
        # callers can treat the field as an optional string.
        suggestion = result.get("suggestion", "No suggestion provided")
        if suggestion is not None and not isinstance(suggestion, str):
            suggestion = str(suggestion)
        result["suggestion"] = suggestion

        return result

    except Exception as e:
        # Best effort by design: one failing clause must not abort the
        # analysis of the rest of the document.
        print(f"Error in Gemini analysis: {e}")
        return {
            "risks": [],
            "suggestion": f"Analysis failed: {str(e)}"
        }
|
app/services/risk_scorer.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List, Tuple
|
| 2 |
+
from app.schemas import AnalyzedClause, RiskFinding
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
# Risk definitions as specified in the requirements
|
| 6 |
+
# Catalogue of known risk categories, keyed by risk ID. Each entry holds the
# integer "score" that is added to a contract's total when the risk is found
# and a default human-readable "description".
RISK_DEFINITIONS = {
    "UNLIMITED_LIABILITY": {"score": 10, "description": "Clause imposes unlimited liability on the client."},
    "ONE_SIDED_TERMINATION": {"score": 8, "description": "Termination rights are unfairly one-sided."},
    "UNCLEAR_JURISDICTION": {"score": 6, "description": "Governing law or jurisdiction for disputes is ambiguous."},
    "DPDP_NON_COMPLIANCE": {"score": 7, "description": "Data protection clause may not comply with the DPDP Act 2023."}
}
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def calculate_scores(analyzed_clauses: List[AnalyzedClause]) -> Tuple[int, List[RiskFinding]]:
    """
    Sum the scores of every risk found across the given clauses.

    Args:
        analyzed_clauses: Clauses whose ``risks`` lists hold the findings

    Returns:
        Tuple of (final_risk_score, all_findings), where all_findings lists
        every risk in clause order
    """
    # Flatten the per-clause risk lists, preserving document order.
    all_findings = [
        finding
        for analyzed in analyzed_clauses
        for finding in analyzed.risks
    ]

    # The final score is simply the sum of the individual risk scores.
    total_score = sum(finding.score for finding in all_findings)

    return total_score, all_findings
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def get_risk_definition(risk_id: str) -> dict:
    """
    Look up the catalogue entry for a risk ID.

    Args:
        risk_id: The risk identifier

    Returns:
        The matching RISK_DEFINITIONS entry, or a zero-score "Unknown risk"
        entry when the ID is not in the catalogue
    """
    try:
        return RISK_DEFINITIONS[risk_id]
    except KeyError:
        return {"score": 0, "description": "Unknown risk"}
|
app/services/text_extractor.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import fitz # PyMuPDF
|
| 2 |
+
import easyocr
|
| 3 |
+
from pdf2image import convert_from_path
|
| 4 |
+
from typing import Optional
|
| 5 |
+
import tempfile
|
| 6 |
+
import os
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
# Shared EasyOCR reader, created on first use. Constructing a Reader is
# costly, so it is reused across calls.
_OCR_READER = None


def _get_ocr_reader():
    """Return the shared EasyOCR reader for English and Hindi."""
    global _OCR_READER
    if _OCR_READER is None:
        _OCR_READER = easyocr.Reader(['en', 'hi'])
    return _OCR_READER


def extract_text_from_pdf(file_path: str) -> str:
    """
    Extract text from PDF using hybrid approach:
    1. First try PyMuPDF for searchable PDFs
    2. If minimal text, fall back to OCR for scanned PDFs

    Args:
        file_path: Path of the PDF file on disk.

    Returns:
        The stripped embedded text when it is longer than 100 characters,
        otherwise the stripped OCR text (which may be empty).

    Raises:
        RuntimeError: If the OCR fallback fails.
    """

    # Step 1: Try PyMuPDF extraction
    try:
        # The context manager closes the document even if a page fails.
        with fitz.open(file_path) as doc:
            text = "".join(
                doc.load_page(page_num).get_text()
                for page_num in range(doc.page_count)
            )

        # Check if we got meaningful text (more than 100 characters)
        if len(text.strip()) > 100:
            return text.strip()

    except Exception as e:
        # Not fatal: the OCR path below may still be able to read the file.
        print(f"PyMuPDF extraction failed: {e}")

    # Step 2: Fall back to OCR for scanned PDFs
    try:
        # numpy is a dependency of EasyOCR; imported here so the block does
        # not rely on a file-level import.
        import numpy as np

        # Convert PDF to images (PIL images, one per page)
        images = convert_from_path(file_path)

        reader = _get_ocr_reader()

        fragments = []
        for image in images:
            # EasyOCR documents file paths, bytes and numpy arrays as input,
            # not PIL images, so each page is converted to an array first.
            page_array = np.asarray(image.convert("RGB"))

            for _bbox, fragment, confidence in reader.readtext(page_array):
                if confidence > 0.5:  # Only include high-confidence text
                    fragments.append(fragment)

        return " ".join(fragments).strip()

    except Exception as e:
        print(f"OCR extraction failed: {e}")
        # RuntimeError is still caught by callers handling Exception; chaining
        # preserves the original traceback.
        raise RuntimeError(f"Failed to extract text from PDF: {e}") from e
|
requirements.txt
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi==0.104.1
|
| 2 |
+
uvicorn[standard]==0.24.0
|
| 3 |
+
pydantic==2.5.0
|
| 4 |
+
python-dotenv==1.0.0
|
| 5 |
+
PyMuPDF==1.23.8
|
| 6 |
+
pdf2image==1.16.3
|
| 7 |
+
easyocr==1.7.0
|
| 8 |
+
google-generativeai==0.3.2
|
| 9 |
+
python-multipart==0.0.6
|
| 10 |
+
Pillow==10.1.0
|