Spaces:

Rivalcoder
/

Smart-Contract-Analyzer

Sleeping

App Files Files Community

Rivalcoder committed on Sep 20, 2025

Commit

4b022af

1 Parent(s): a9ff187

Add Files

Browse files

Files changed (31) hide show

Dockerfile +58 -0
app/__init__.py +0 -0
app/__pycache__/__init__.cpython-311.pyc +0 -0
app/__pycache__/__init__.cpython-312.pyc +0 -0
app/__pycache__/config.cpython-311.pyc +0 -0
app/__pycache__/config.cpython-312.pyc +0 -0
app/__pycache__/main.cpython-311.pyc +0 -0
app/__pycache__/main.cpython-312.pyc +0 -0
app/__pycache__/schemas.cpython-311.pyc +0 -0
app/__pycache__/schemas.cpython-312.pyc +0 -0
app/config.py +28 -0
app/data/Basic-NDA.pdf +0 -0
app/data/Basic-Non-Disclosure-Agreement.pdf +0 -0
app/main.py +153 -0
app/schemas.py +23 -0
app/services/__init__.py +0 -0
app/services/__pycache__/__init__.cpython-311.pyc +0 -0
app/services/__pycache__/__init__.cpython-312.pyc +0 -0
app/services/__pycache__/preprocessor.cpython-311.pyc +0 -0
app/services/__pycache__/preprocessor.cpython-312.pyc +0 -0
app/services/__pycache__/risk_analyzer.cpython-311.pyc +0 -0
app/services/__pycache__/risk_analyzer.cpython-312.pyc +0 -0
app/services/__pycache__/risk_scorer.cpython-311.pyc +0 -0
app/services/__pycache__/risk_scorer.cpython-312.pyc +0 -0
app/services/__pycache__/text_extractor.cpython-311.pyc +0 -0
app/services/__pycache__/text_extractor.cpython-312.pyc +0 -0
app/services/preprocessor.py +43 -0
app/services/risk_analyzer.py +113 -0
app/services/risk_scorer.py +49 -0
app/services/text_extractor.py +58 -0
requirements.txt +10 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,58 @@

+# Use Python 3.11 slim image as base
+FROM python:3.11-slim
+# Set working directory
+WORKDIR /app
+# Install system dependencies required for PDF processing and OCR.
+# libgl1 is used instead of the transitional libgl1-mesa-glx package name,
+# which newer Debian releases no longer provide; libgl1 supplies the same
+# libGL runtime on both older and newer bases.
+RUN apt-get update && apt-get install -y \
+    tesseract-ocr \
+    tesseract-ocr-eng \
+    tesseract-ocr-hin \
+    tesseract-ocr-ben \
+    tesseract-ocr-tam \
+    tesseract-ocr-tel \
+    tesseract-ocr-guj \
+    tesseract-ocr-mar \
+    tesseract-ocr-pan \
+    tesseract-ocr-ori \
+    tesseract-ocr-asm \
+    tesseract-ocr-mal \
+    tesseract-ocr-kan \
+    libgl1 \
+    libglib2.0-0 \
+    libsm6 \
+    libxext6 \
+    libxrender-dev \
+    libgomp1 \
+    libgcc-s1 \
+    poppler-utils \
+    && rm -rf /var/lib/apt/lists/*
+# Copy requirements first for better Docker layer caching
+COPY requirements.txt .
+# Install Python dependencies
+RUN pip install --no-cache-dir -r requirements.txt
+# Copy application code
+COPY app/ ./app/
+# Create a non-root user for security
+RUN useradd --create-home --shell /bin/bash app \
+    && chown -R app:app /app
+USER app
+# Expose port 8000
+EXPOSE 8000
+# Set environment variables
+ENV PYTHONPATH=/app
+ENV PYTHONUNBUFFERED=1
+# Health check.
+# The slim base image does not ship curl and it is not installed above, so the
+# probe uses the Python interpreter that is already present. urlopen raises
+# (non-zero exit status) on connection failures and on HTTP error responses.
+HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
+    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health', timeout=10)" || exit 1
+# Run the application
+CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]

app/__init__.py ADDED Viewed

File without changes

app/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (167 Bytes). View file

app/__pycache__/__init__.cpython-312.pyc ADDED Viewed

Binary file (142 Bytes). View file

app/__pycache__/config.cpython-311.pyc ADDED Viewed

Binary file (843 Bytes). View file

app/__pycache__/config.cpython-312.pyc ADDED Viewed

Binary file (704 Bytes). View file

app/__pycache__/main.cpython-311.pyc ADDED Viewed

Binary file (6.65 kB). View file

app/__pycache__/main.cpython-312.pyc ADDED Viewed

Binary file (5.72 kB). View file

app/__pycache__/schemas.cpython-311.pyc ADDED Viewed

Binary file (1.49 kB). View file

app/__pycache__/schemas.cpython-312.pyc ADDED Viewed

Binary file (1.19 kB). View file

app/config.py ADDED Viewed

	@@ -0,0 +1,28 @@

+import os
+from dotenv import load_dotenv
+# Pull variables from a local .env file into the process environment, if one exists
+load_dotenv()
+# Gemini API key as read from the environment (None when the variable is unset)
+GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
+# Plain namespace exposing the application's settings as class attributes
+class Settings:
+    GEMINI_API_KEY: str = GEMINI_API_KEY
+    LOG_LEVEL: str = os.environ.get("LOG_LEVEL", "INFO")
+    # Size limit in bytes; falls back to 10485760 (10MB) when unset
+    MAX_FILE_SIZE: int = int(os.environ.get("MAX_FILE_SIZE", "10485760"))
+    def validate(self):
+        """Raise ValueError when no Gemini API key is configured."""
+        if self.GEMINI_API_KEY:
+            return
+        raise ValueError(
+            "GEMINI_API_KEY not found in environment variables. "
+            "Please set it in your environment or .env file."
+        )
+settings = Settings()
+# Validation runs at import time, but only when a key is present, so the
+# module still imports cleanly in environments without one
+if settings.GEMINI_API_KEY:
+    settings.validate()

app/data/Basic-NDA.pdf ADDED Viewed

Binary file (75.3 kB). View file

app/data/Basic-Non-Disclosure-Agreement.pdf ADDED Viewed

Binary file (71.7 kB). View file

app/main.py ADDED Viewed

	@@ -0,0 +1,153 @@

+from fastapi import FastAPI, File, UploadFile, HTTPException
+from fastapi.responses import JSONResponse
+import tempfile
+import os
+from typing import List
+# Import our services
+from app.services.text_extractor import extract_text_from_pdf
+from app.services.preprocessor import segment_into_clauses
+from app.services.risk_analyzer import analyze_clause_with_gemini
+from app.services.risk_scorer import calculate_scores, get_risk_definition
+from app.schemas import AnalysisReport, AnalyzedClause, RiskFinding
+# Create the FastAPI app instance; title, description and version are the
+# metadata FastAPI publishes for this service. The route decorators below
+# register their handlers on this object.
+app = FastAPI(
+    title="Multilingual Legal Contract Analyzer",
+    description="AI-powered contract analysis for English and Indic languages",
+    version="1.0.0"
+)
+@app.post("/analyze/", response_model=AnalysisReport)
+async def analyze_contract(file: UploadFile = File(...)):
+    """
+    Analyze a legal contract PDF and return detailed risk analysis.
+    The upload is written to a temporary file, its text is extracted and split
+    into clauses, every clause is sent to Gemini for risk identification, and
+    the scores of all identified risks are summed into the final risk score.
+    Args:
+        file: PDF file to analyze
+    Returns:
+        AnalysisReport with risk analysis and suggestions
+    Raises:
+        HTTPException: 400 when the upload is not named ``*.pdf``, when fewer
+            than 50 characters of text can be extracted, or when no clauses
+            are found; 500 when any other step fails.
+    """
+    # Validate file type; an upload may arrive without a filename at all
+    filename = file.filename or ""
+    if not filename.lower().endswith('.pdf'):
+        raise HTTPException(
+            status_code=400, detail="Only PDF files are supported")
+    # delete=False because the extractor reopens the file by path, so it has
+    # to outlive this handle; the finally block below removes it.
+    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf')
+    temp_path = temp_file.name
+    try:
+        # Write the upload and close the handle before the path is reopened
+        with temp_file:
+            temp_file.write(await file.read())
+        # NOTE(review): the extraction and Gemini calls below are synchronous
+        # and are invoked directly from this coroutine.
+        # Step 1: Extract text from PDF
+        print(f"Extracting text from {filename}...")
+        full_text = extract_text_from_pdf(temp_path)
+        if not full_text or len(full_text.strip()) < 50:
+            raise HTTPException(
+                status_code=400,
+                detail="Unable to extract meaningful text from PDF. Please ensure the PDF is readable."
+            )
+        # Step 2: Segment text into clauses
+        print("Segmenting text into clauses...")
+        clauses = segment_into_clauses(full_text)
+        if not clauses:
+            raise HTTPException(
+                status_code=400,
+                detail="Unable to identify contract clauses. Please ensure the document is a valid contract."
+            )
+        # Step 3: Analyze each clause with Gemini AI
+        print(f"Analyzing {len(clauses)} clauses with AI...")
+        analyzed_clauses = []
+        for i, clause_text in enumerate(clauses, 1):
+            print(f"Analyzing clause {i}/{len(clauses)}...")
+            ai_result = analyze_clause_with_gemini(clause_text)
+            # Convert AI results to RiskFinding objects; the score always
+            # comes from the local risk table, never from the model
+            risks = []
+            for risk_data in ai_result.get("risks", []):
+                risk_id = risk_data.get("risk_id")
+                if not risk_id:
+                    continue
+                risk_def = get_risk_definition(risk_id)
+                risks.append(RiskFinding(
+                    risk_id=risk_id,
+                    description=risk_data.get(
+                        "explanation", risk_def["description"]),
+                    score=risk_def["score"]
+                ))
+            # Truncate long clauses for the response
+            if len(clause_text) > 500:
+                display_text = clause_text[:500] + "..."
+            else:
+                display_text = clause_text
+            analyzed_clauses.append(AnalyzedClause(
+                clause_number=i,
+                text=display_text,
+                risks=risks,
+                suggestion=ai_result.get("suggestion")
+            ))
+        # Step 4: Calculate final risk score (the flat findings list is not needed here)
+        print("Calculating final risk score...")
+        final_score, _ = calculate_scores(analyzed_clauses)
+        # Step 5: Determine contract type and language (basic detection)
+        contract_type = "General Contract"  # Could be enhanced with AI detection
+        language = "English"  # Could be enhanced with language detection
+        # Create final analysis report
+        analysis_report = AnalysisReport(
+            file_name=filename,
+            language=language,
+            contract_type=contract_type,
+            final_risk_score=final_score,
+            clauses=analyzed_clauses
+        )
+        print(f"Analysis complete. Final risk score: {final_score}")
+        return analysis_report
+    except HTTPException:
+        # Deliberate 4xx responses raised above pass through unchanged
+        raise
+    except Exception as e:
+        print(f"Error during analysis: {e}")
+        raise HTTPException(
+            status_code=500,
+            detail=f"Analysis failed: {str(e)}"
+        ) from e
+    finally:
+        # Clean up temporary file; a failed removal is reported, not raised,
+        # so it cannot mask the real outcome of the request
+        try:
+            os.unlink(temp_path)
+        except OSError as cleanup_error:
+            print(f"Failed to remove temporary file {temp_path}: {cleanup_error}")
+@app.get("/")
+async def root():
+    """Return a static banner confirming the API is running."""
+    banner = {"message": "Multilingual Legal Contract Analyzer API is running"}
+    return banner
+@app.get("/health")
+async def health_check():
+    """Report a fixed healthy status together with the service name."""
+    status_payload = {"status": "healthy", "service": "contract-analyzer"}
+    return status_payload
+if __name__ == "__main__":
+    # Direct-execution entry point: serve the app on all interfaces, port 8000.
+    # uvicorn is imported here so it is only needed when run this way.
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=8000)

app/schemas.py ADDED Viewed

	@@ -0,0 +1,23 @@

+from pydantic import BaseModel
+from typing import List, Optional
+# One risk identified in a clause.
+class RiskFinding(BaseModel):
+    # Identifier of the risk category
+    risk_id: str
+    # Human-readable explanation of the risk
+    description: str
+    # Numeric weight of this risk
+    score: int
+# A single clause together with the risks found in it.
+class AnalyzedClause(BaseModel):
+    # Position of the clause within the document
+    clause_number: int
+    # Clause text
+    text: str
+    # Risks identified in this clause
+    risks: List[RiskFinding]
+    # Optional suggested alternative or modification; defaults to None
+    suggestion: Optional[str] = None
+# Top-level result for one analyzed document.
+class AnalysisReport(BaseModel):
+    # Name of the analyzed file
+    file_name: str
+    # Language label of the document
+    language: str
+    # Contract type label
+    contract_type: str
+    # Overall risk score for the document
+    final_risk_score: int
+    # Per-clause analysis results
+    clauses: List[AnalyzedClause]

app/services/__init__.py ADDED Viewed

File without changes

app/services/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (176 Bytes). View file

app/services/__pycache__/__init__.cpython-312.pyc ADDED Viewed

Binary file (151 Bytes). View file

app/services/__pycache__/preprocessor.cpython-311.pyc ADDED Viewed

Binary file (1.81 kB). View file

app/services/__pycache__/preprocessor.cpython-312.pyc ADDED Viewed

Binary file (1.46 kB). View file

app/services/__pycache__/risk_analyzer.cpython-311.pyc ADDED Viewed

Binary file (4.07 kB). View file

app/services/__pycache__/risk_analyzer.cpython-312.pyc ADDED Viewed

Binary file (3.82 kB). View file

app/services/__pycache__/risk_scorer.cpython-311.pyc ADDED Viewed

Binary file (1.99 kB). View file

app/services/__pycache__/risk_scorer.cpython-312.pyc ADDED Viewed

Binary file (1.84 kB). View file

app/services/__pycache__/text_extractor.cpython-311.pyc ADDED Viewed

Binary file (2.31 kB). View file

app/services/__pycache__/text_extractor.cpython-312.pyc ADDED Viewed

Binary file (2.03 kB). View file

app/services/preprocessor.py ADDED Viewed

	@@ -0,0 +1,43 @@

+import re
+from typing import List
+def segment_into_clauses(full_text: str) -> List[str]:
+    """
+    Segment the full document text into individual clauses using regex patterns.
+    Splits on common clause markers found at the start of a line, such as
+    "1.", "1.1", "(a)", "iv.", "IV." and "Article 2:"; the markers themselves
+    are discarded. If that produces at most one clause, falls back to
+    splitting on blank lines and on sentence ends followed by a line break.
+    Args:
+        full_text: Raw text of the whole document
+    Returns:
+        Stripped clause strings longer than 50 characters, in document order
+    """
+    # Shorter fragments are treated as headings or noise rather than clauses
+    min_clause_length = 50
+    # Regex patterns for the different clause marker formats
+    clause_patterns = [
+        r'\n\s*\d+\.\s+',  # "1. ", "2. ", etc.
+        r'\n\s*\d+\.\d+\s+',  # "1.1 ", "1.2 ", etc.
+        r'\n\s*\(\w+\)\s+',  # "(a) ", "(b) ", etc.
+        r'\n\s*[ivx]+\.\s+',  # "i. ", "ii. ", "iii. ", etc.
+        r'\n\s*[IVX]+\.\s+',  # "I. ", "II. ", "III. ", etc.
+        r'\n\s*Article\s+\d+\s*:',  # "Article 1:", "Article 2:", etc.
+        r'\n\s*Section\s+\d+\s*:',  # "Section 1:", "Section 2:", etc.
+        r'\n\s*Clause\s+\d+\s*:',  # "Clause 1:", "Clause 2:", etc.
+    ]
+    # Combine all patterns with OR operator and split on any of them
+    combined_pattern = '|'.join(clause_patterns)
+    cleaned_clauses = [
+        piece.strip()
+        for piece in re.split(combined_pattern, full_text)
+        if len(piece.strip()) > min_clause_length
+    ]
+    # If no clauses were found with the patterns, try a simpler approach
+    if len(cleaned_clauses) <= 1:
+        # Split by blank lines, or at a line break that follows a period and
+        # precedes a capital letter. The look-behind/look-ahead keep the
+        # period and the capital letter in the text: matching them directly
+        # would strip the final "." of one clause and the first letter of
+        # the next.
+        fallback_pattern = r'\n\s*\n|(?<=\.)\s*\n\s*(?=[A-Z])'
+        cleaned_clauses = [
+            piece.strip()
+            for piece in re.split(fallback_pattern, full_text)
+            if len(piece.strip()) > min_clause_length
+        ]
+    return cleaned_clauses

app/services/risk_analyzer.py ADDED Viewed

	@@ -0,0 +1,113 @@

+import google.generativeai as genai
+from app.config import settings
+from app.services.risk_scorer import RISK_DEFINITIONS
+import json
+import re
+from typing import Dict, List
+# Configure the Gemini API.
+# This runs once, when the module is imported, with the key held by
+# app.config.settings; that value is None when GEMINI_API_KEY is not set.
+genai.configure(api_key=settings.GEMINI_API_KEY)
+def _parse_gemini_response(response_text: str) -> Dict:
+    """Return the JSON object embedded in a Gemini reply, or a fallback dict."""
+    fallback = {
+        "risks": [],
+        "suggestion": "Unable to parse AI response - manual review recommended"
+    }
+    # Gemini sometimes surrounds the JSON with extra text, so take the span
+    # from the first "{" to the last "}" when there is one
+    json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
+    candidate = json_match.group() if json_match else response_text
+    try:
+        parsed = json.loads(candidate)
+    except json.JSONDecodeError:
+        return fallback
+    # Valid JSON that is not an object (a bare string, list, ...) is unusable
+    if not isinstance(parsed, dict):
+        return fallback
+    return parsed
+def analyze_clause_with_gemini(clause_text: str) -> Dict:
+    """
+    Analyze a contract clause using Google Gemini AI for risk identification.
+    Args:
+        clause_text: The text of the clause to analyze
+    Returns:
+        Dictionary with two keys: "risks", a list of dicts whose "risk_id" is
+        a key of RISK_DEFINITIONS, and "suggestion". Failures are not raised:
+        an unparseable reply or any error during the call yields an empty
+        "risks" list and a "suggestion" describing the problem.
+    """
+    # Create the detailed prompt for Gemini
+    prompt = f"""
+You are an expert Indian legal consultant specializing in contract analysis and risk assessment.
+Analyze the following contract clause and identify any legal risks based on the predefined risk categories.
+CONTRACT CLAUSE TO ANALYZE:
+{clause_text}
+RISK CATEGORIES TO CHECK FOR:
+1. UNLIMITED_LIABILITY: Clause imposes unlimited liability on the client
+2. ONE_SIDED_TERMINATION: Termination rights are unfairly one-sided
+3. UNCLEAR_JURISDICTION: Governing law or jurisdiction for disputes is ambiguous
+4. DPDP_NON_COMPLIANCE: Data protection clause may not comply with the DPDP Act 2023
+INSTRUCTIONS:
+1. Carefully read the clause text
+2. Identify which of the above risk categories apply to this clause
+3. For each identified risk, provide a brief explanation
+4. Suggest a compliant alternative or modification for any identified risks
+5. If no risks are found, return the JSON with an empty "risks" list shown at the end
+RESPONSE FORMAT (JSON):
+{{
+    "risks": [
+        {{
+            "risk_id": "RISK_CATEGORY_ID",
+            "explanation": "Brief explanation of why this risk applies"
+        }}
+    ],
+    "suggestion": "Compliant alternative or modification suggestion"
+}}
+If no risks are identified, return:
+{{
+    "risks": [],
+    "suggestion": "No risks identified - clause appears compliant"
+}}
+"""
+    try:
+        # Initialize the Gemini model and generate the response
+        model = genai.GenerativeModel('gemini-2.5-flash-lite')
+        response = model.generate_content(prompt)
+        result = _parse_gemini_response(response.text.strip())
+        # Validate and clean the response
+        if "suggestion" not in result:
+            result["suggestion"] = "No suggestion provided"
+        # A missing or non-list "risks" value is treated as no risks
+        raw_risks = result.get("risks")
+        if not isinstance(raw_risks, list):
+            raw_risks = []
+        # Keep only entries whose risk_id is one of the known categories
+        result["risks"] = [
+            risk for risk in raw_risks
+            if isinstance(risk, dict)
+            and isinstance(risk.get("risk_id"), str)
+            and risk["risk_id"] in RISK_DEFINITIONS
+        ]
+        return result
+    except Exception as e:
+        # Best-effort by design: one failing clause must not abort the report
+        print(f"Error in Gemini analysis: {e}")
+        return {
+            "risks": [],
+            "suggestion": f"Analysis failed: {str(e)}"
+        }

app/services/risk_scorer.py ADDED Viewed

	@@ -0,0 +1,49 @@

+from typing import List, Tuple
+from app.schemas import AnalyzedClause, RiskFinding
+# Risk definitions as specified in the requirements.
+# Maps each risk identifier to its "score" (the amount added to the final
+# risk score per finding) and a default "description".
+RISK_DEFINITIONS = {
+    "UNLIMITED_LIABILITY": {"score": 10, "description": "Clause imposes unlimited liability on the client."},
+    "ONE_SIDED_TERMINATION": {"score": 8, "description": "Termination rights are unfairly one-sided."},
+    "UNCLEAR_JURISDICTION": {"score": 6, "description": "Governing law or jurisdiction for disputes is ambiguous."},
+    "DPDP_NON_COMPLIANCE": {"score": 7, "description": "Data protection clause may not comply with the DPDP Act 2023."}
+}
+def calculate_scores(analyzed_clauses: List[AnalyzedClause]) -> Tuple[int, List[RiskFinding]]:
+    """
+    Sum the scores of every risk found across the given clauses.
+    Args:
+        analyzed_clauses: List of analyzed clauses with identified risks
+    Returns:
+        Tuple of (final_risk_score, all_findings), where all_findings lists
+        every risk in clause order and final_risk_score is the sum of their scores
+    """
+    # Flatten the per-clause risk lists, preserving clause and risk order
+    all_findings = [
+        finding
+        for analyzed in analyzed_clauses
+        for finding in analyzed.risks
+    ]
+    total_score = sum(finding.score for finding in all_findings)
+    return total_score, all_findings
+def get_risk_definition(risk_id: str) -> dict:
+    """
+    Look up the definition of a risk category.
+    Args:
+        risk_id: The risk identifier
+    Returns:
+        The matching RISK_DEFINITIONS entry, or a zero-score "Unknown risk"
+        entry when the identifier is not defined
+    """
+    if risk_id in RISK_DEFINITIONS:
+        return RISK_DEFINITIONS[risk_id]
+    return {"score": 0, "description": "Unknown risk"}

app/services/text_extractor.py ADDED Viewed

	@@ -0,0 +1,58 @@

+import fitz  # PyMuPDF
+import easyocr
+from pdf2image import convert_from_path
+from typing import Optional
+import tempfile
+import os
+def extract_text_from_pdf(file_path: str) -> str:
+    """
+    Extract text from PDF using hybrid approach:
+    1. First try PyMuPDF for searchable PDFs
+    2. If that yields 100 characters or fewer (or fails), fall back to OCR
+       for scanned PDFs
+    Args:
+        file_path: Path of the PDF file to read
+    Returns:
+        The stripped extracted text; may be empty when OCR recognizes nothing
+        with confidence above 0.5
+    Raises:
+        RuntimeError: If the OCR fallback fails
+    """
+    # Step 1: Try PyMuPDF extraction
+    try:
+        # The context manager closes the document even if a page fails to load
+        with fitz.open(file_path) as doc:
+            embedded_text = "".join(page.get_text() for page in doc).strip()
+        # Check if we got meaningful text (more than 100 characters)
+        if len(embedded_text) > 100:
+            return embedded_text
+    except Exception as e:
+        # Best-effort: any PyMuPDF failure just means the OCR path is tried
+        print(f"PyMuPDF extraction failed: {e}")
+    # Step 2: Fall back to OCR for scanned PDFs
+    try:
+        # Imported here so the file's import block is left untouched
+        # NOTE(review): numpy is not pinned in requirements.txt - presumably
+        # it is installed as an EasyOCR dependency; confirm.
+        import numpy as np
+        # Convert PDF to images
+        images = convert_from_path(file_path)
+        # Initialize EasyOCR for English and Hindi
+        reader = easyocr.Reader(['en', 'hi'])
+        recognized = []
+        for image in images:
+            # EasyOCR documents file paths, bytes and numpy arrays as inputs,
+            # so each PIL page image is converted to an array first
+            for _bbox, fragment, confidence in reader.readtext(np.array(image)):
+                if confidence > 0.5:  # Only include high-confidence text
+                    recognized.append(fragment)
+        return " ".join(recognized).strip()
+    except Exception as e:
+        print(f"OCR extraction failed: {e}")
+        raise RuntimeError(f"Failed to extract text from PDF: {e}") from e

requirements.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+# Web API, ASGI server, response schemas and .env loading
+fastapi==0.104.1
+uvicorn[standard]==0.24.0
+pydantic==2.5.0
+python-dotenv==1.0.0
+# PDF text extraction and OCR fallback
+PyMuPDF==1.23.8
+pdf2image==1.16.3
+easyocr==1.7.0
+# Gemini client used for clause analysis
+google-generativeai==0.3.2
+# Multipart form parsing for file uploads
+python-multipart==0.0.6
+Pillow==10.1.0