Nilyzz committed on
Commit
306e475
·
1 Parent(s): 12091ef

Add files

Browse files
Dockerfile ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Minimal Python 3.10 image for the ClauseWatch FastAPI backend.
FROM python:3.10-slim

WORKDIR /app

# Copy the dependency manifest first so the pip layer is cached
# unless requirements.txt changes.
COPY requirements.txt .

RUN pip install --no-cache-dir -r requirements.txt

COPY . .

# Run as an unprivileged user; UID 1000 follows the Hugging Face Spaces convention.
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

# 7860 is the port Hugging Face Spaces expects the app to listen on.
EXPOSE 7860

CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
app/core/__pycache__/database.cpython-312.pyc ADDED
Binary file (870 Bytes). View file
 
app/core/database.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""SQLAlchemy engine/session setup for the SQLite persistence layer."""

from sqlalchemy import create_engine
# declarative_base moved to sqlalchemy.orm in SQLAlchemy 1.4; the
# sqlalchemy.ext.declarative import is deprecated.
from sqlalchemy.orm import declarative_base, sessionmaker

# SQLite database file in the project root.
SQLALCHEMY_DATABASE_URL = "sqlite:///./clausewatch.db"

# check_same_thread=False is required because FastAPI may service a request
# on a different thread than the one that opened the SQLite connection.
engine = create_engine(
    SQLALCHEMY_DATABASE_URL, connect_args={"check_same_thread": False}
)

SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)

Base = declarative_base()


def get_db():
    """FastAPI dependency: yield a DB session and always close it afterwards."""
    db = SessionLocal()
    try:
        yield db
    finally:
        db.close()
app/models/__pycache__/sql_models.cpython-312.pyc ADDED
Binary file (918 Bytes). View file
 
app/models/sql_models.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from sqlalchemy import Column, Integer, String, Float, DateTime
from datetime import datetime
from app.core.database import Base


class AnalysisRecord(Base):
    """One row per analyzed contract: summary metrics for the history endpoint."""

    __tablename__ = "analysis_history"

    id = Column(Integer, primary_key=True, index=True)
    # Original upload filename; indexed so history lookups by file are fast.
    filename = Column(String, index=True)
    # Server-side timestamp (naive UTC) set when the row is created.
    upload_date = Column(DateTime, default=datetime.utcnow)
    # 0-100: percentage of analyzed clauses flagged as risky.
    risk_score = Column(Integer)
    total_clauses = Column(Integer)
    risky_clauses = Column(Integer)
app/services/__pycache__/nlp_engine.cpython-312.pyc ADDED
Binary file (4.21 kB). View file
 
app/services/__pycache__/vector_store.cpython-312.pyc ADDED
Binary file (3.77 kB). View file
 
app/services/nlp_engine.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
3
+ import torch.nn.functional as F
4
+
5
class LegalNLPEngine:
    """Hybrid clause risk analyzer.

    Deterministic Spanish keyword heuristics decide clear-cut cases first;
    any remaining clause is scored by a Legal-BERT sequence classifier.
    """

    def __init__(self):
        self.model_name = "nlpaueb/legal-bert-base-uncased"
        # Prefer GPU when available; fall back to CPU.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        print(f"Loading NLP Model: {self.model_name} on {self.device}...")

        # 1. TOKENIZER: converts text to token ids.
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)

        # 2. MODEL: binary classification head on top of Legal-BERT.
        # NOTE(review): num_labels=2 on a base checkpoint means the head is
        # randomly initialized unless fine-tuned weights are loaded — confirm.
        self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name, num_labels=2)
        self.model.to(self.device)
        self.model.eval()

    def analyze_clause(self, text: str):
        """Classify a single clause.

        Returns a dict with text_snippet/label/confidence/is_risky, or None
        when the input is empty or shorter than 10 characters.
        """
        if not text or len(text) < 10:
            return None

        # --- Rule-based heuristics (checked before the model) ---
        text_lower = text.lower()

        risky_keywords = [
            "modificación unilateral", "exención total de responsabilidad",
            "venta de datos", "renuncia a derechos", "demandas colectivas",
            "arbitraje privado", "sin previo aviso", "no se hace responsable",
            "derecho irrevocable", "renunciando a la jurisdicción",
            "indemnización", "sin compensación", "datos a terceros"
        ]

        safe_keywords = [
            "horario", "jornada", "fecha", "nombre", "domicilio",
            "dni", "firmado", "en prueba", "convenio", "trabajador",
            "vacaciones", "nómina", "seguridad social", "protección de datos",
            "anexo", "contrato", "acuerdo", "estipulaciones", "cláusula",
            "firmando", "lugar y fecha", "reunidos"
        ]

        # Risky keywords win over safe ones: they are checked first.
        if any(k in text_lower for k in risky_keywords):
            return {
                "text_snippet": text[:100] + "...",
                "label": "POTENTIAL_RISK",
                "confidence": 0.95,
                "is_risky": True
            }

        if any(k in text_lower for k in safe_keywords):
            return {
                "text_snippet": text[:100] + "...",
                "label": "ACCEPTABLE",
                "confidence": 0.90,
                "is_risky": False
            }

        # --- Legal-BERT inference for clauses the rules did not decide ---
        try:
            # Tokenize (truncate to BERT's 512-token limit).
            inputs = self.tokenizer(
                text,
                return_tensors="pt",
                truncation=True,
                max_length=512,
                padding=True
            ).to(self.device)

            # Forward pass without gradient tracking.
            with torch.no_grad():
                outputs = self.model(**inputs)

            probs = F.softmax(outputs.logits, dim=1)

            # Probability assigned to class index 1 ("risky").
            risk_score = probs[0][1].item()

            is_risky_ai = risk_score > 0.55

            return {
                "text_snippet": text[:100] + "...",
                "label": "AI_DETECTED_RISK" if is_risky_ai else "AI_CLEARED",
                "confidence": round(float(max(probs[0])), 2),
                "is_risky": is_risky_ai
            }

        except Exception as e:
            # BUGFIX: the exception was silently swallowed (e unused); log it
            # so model failures are visible, then degrade to a neutral verdict.
            print(f"NLP inference failed, returning NEUTRAL: {e}")
            return {
                "text_snippet": text[:100] + "...",
                "label": "NEUTRAL",
                "confidence": 0.0,
                "is_risky": False
            }
96
+
97
# Module-level singleton: the model is loaded once at import time and shared
# by all request handlers.
nlp_engine = LegalNLPEngine()
app/services/vector_store.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import google.generativeai as genai
2
+ import numpy as np
3
+ import os
4
+
5
class InMemoryVectorStore:
    """Per-filename in-memory vector index backed by the Google embedding API."""

    def __init__(self):
        # filename -> list of {"text", "vector", "metadata"} entries.
        self.store = {}

        self.model_name = "models/text-embedding-004"

    def get_embedding(self, text):
        """Embed one document chunk; returns [] on any API failure."""
        try:
            result = genai.embed_content(
                model=self.model_name,
                content=text,
                task_type="retrieval_document"
            )
            return result['embedding']
        except Exception as e:
            print(f"Error getting embedding: {e}")
            return []

    def add_contract(self, filename: str, chunks: list):
        """Index a contract's chunks under its filename (replaces any prior index)."""
        print(f"Indexing {filename} using Google Embeddings...")

        self.store[filename] = []

        for chunk in chunks:
            text = chunk["text"]
            vector = self.get_embedding(text)

            # Skip chunks whose embedding failed ([] is falsy).
            if vector:
                self.store[filename].append({
                    "text": text,
                    "vector": np.array(vector),
                    "metadata": {"page": chunk["page"]}
                })

        print(f"Indexed {len(self.store[filename])} chunks for {filename}")

    def search_similar(self, query: str, filename: str, n_results: int = 3):
        """Cosine-similarity search; returns a Chroma-style result dict."""
        empty = {"documents": [[]], "metadatas": [[]], "distances": [[]]}
        if filename not in self.store:
            return empty

        try:
            query_emb = genai.embed_content(
                model=self.model_name,
                content=query,
                task_type="retrieval_query"
            )['embedding']
            query_vec = np.array(query_emb)
        except Exception as e:
            # BUGFIX: was a bare `except:` (also swallowed KeyboardInterrupt);
            # narrowed and logged.
            print(f"Error embedding query: {e}")
            return empty

        scores = []
        query_norm = np.linalg.norm(query_vec)
        for item in self.store[filename]:
            doc_vec = item["vector"]
            denom = query_norm * np.linalg.norm(doc_vec)
            # Guard against zero-norm vectors, which would otherwise yield NaN
            # and corrupt the ranking.
            score = float(np.dot(query_vec, doc_vec) / denom) if denom else 0.0
            scores.append((score, item))

        scores.sort(key=lambda x: x[0], reverse=True)
        top_results = scores[:n_results]

        # Distance = 1 - cosine similarity, matching the Chroma convention.
        return {
            "documents": [[res[1]["text"] for res in top_results]],
            "metadatas": [[res[1]["metadata"] for res in top_results]],
            "distances": [[1 - res[0] for res in top_results]]
        }
69
+
70
# Global singleton instance shared by the API endpoints.
vector_db = InMemoryVectorStore()
clausewatch.db ADDED
Binary file (16.4 kB). View file
 
main.py ADDED
@@ -0,0 +1,333 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fitz
2
+ import os
3
+ import google.generativeai as genai
4
+ from fastapi import FastAPI, UploadFile, File, HTTPException, Depends
5
+ from fastapi.middleware.cors import CORSMiddleware
6
+ from sqlalchemy.orm import Session
7
+ from pydantic import BaseModel
8
+ from typing import List, Optional
9
+ from deep_translator import GoogleTranslator
10
+ from langdetect import detect
11
+ from dotenv import load_dotenv
12
+ from app.services.nlp_engine import nlp_engine
13
+ from app.core.database import engine, Base, get_db
14
+ from app.models.sql_models import AnalysisRecord
15
+ from app.services.vector_store import vector_db
16
+
17
+
18
+ # --- CONFIGURATION ---
19
+ load_dotenv()
20
+
21
+ api_key = os.getenv("API_KEY_GEMINI")
22
+ if not api_key:
23
+ print("WARNING: API_KEY_GEMINI not found in .env file")
24
+ else:
25
+ genai.configure(api_key=api_key.strip())
26
+
27
+ model = genai.GenerativeModel("gemini-2.5-flash")
28
+
29
+ # Create database tables
30
+ Base.metadata.create_all(bind=engine)
31
+
32
+ app = FastAPI(
33
+ title="ClauseWatch AI API",
34
+ description="API for contract analysis using deterministic NLP and Hybrid Persistence.",
35
+ version="1.0.0",
36
+ )
37
+
38
+ # --- CORS CONFIGURATION ---
39
+ origins = [
40
+ "http://localhost:3000",
41
+ "http://127.0.0.1:3000",
42
+ "https://clause-watch-ia.vercel.app",
43
+ "https://clause-watch-ia.vercel.app/",
44
+ ]
45
+
46
+ app.add_middleware(
47
+ CORSMiddleware,
48
+ allow_origins=origins,
49
+ allow_credentials=True,
50
+ allow_methods=["*"],
51
+ allow_headers=["*"],
52
+ )
53
+
54
+
55
# --- Pydantic Models ---
class ClauseAnalysis(BaseModel):
    """Per-clause verdict produced by the NLP engine."""
    text_snippet: str
    label: str
    confidence: float
    is_risky: bool


class ContractAnalysisResponse(BaseModel):
    """Aggregate analysis returned by POST /api/v1/analyze."""
    filename: str
    language: str
    risk_score: int
    total_clauses_analyzed: int
    risky_clauses_count: int
    details: List[ClauseAnalysis]


class SearchQuery(BaseModel):
    """Request body for POST /api/v1/search."""
    query: str
    filename: str
    # Language the document was indexed in; the query is translated into it.
    doc_language: str = "es"
    top_k: int = 3


class SearchResultItem(BaseModel):
    """One semantic-search hit."""
    text: str
    similarity_score: float
    metadata: dict


class SearchResponse(BaseModel):
    """Envelope for semantic-search results."""
    results: List[SearchResultItem]


class ExplainRequest(BaseModel):
    """Request body for POST /api/v1/explain; query is an optional user question."""
    text: str
    query: Optional[str] = None
92
+
93
+
94
+ # --- Helper Functions ---
95
def extract_text_with_metadata(file_content: bytes) -> List[dict]:
    """Extract text chunks (with 1-based page numbers) from a PDF byte stream.

    Blocks longer than 300 chars are split into sentences on ". "; blocks
    under ~50 chars and sentences under ~30 chars are discarded as noise.
    Returns a list of {"text": str, "page": int} dicts.
    """
    doc = fitz.open(stream=file_content, filetype="pdf")
    chunks_data = []

    try:
        for page_num, page in enumerate(doc):
            blocks = page.get_text("blocks")

            for block in blocks:
                # block[4] is the text payload of a fitz "blocks" tuple.
                text_block = block[4].strip()

                # Collapse intra-block line breaks into single spaces.
                clean_text = " ".join(text_block.splitlines())

                if len(clean_text) > 50:
                    # Split overly long blocks by sentences.
                    if len(clean_text) > 300:
                        sentences = clean_text.split(". ")
                        for sentence in sentences:
                            if len(sentence) > 30:
                                # Normalize trailing punctuation to one period.
                                final_sent = sentence.strip().rstrip(".") + "."

                                chunks_data.append(
                                    {"text": final_sent, "page": page_num + 1}
                                )
                    else:
                        final_text = clean_text.strip().rstrip(".") + "."
                        chunks_data.append({"text": final_text, "page": page_num + 1})
    finally:
        # BUGFIX: the document was never closed; fitz holds native resources,
        # so release them even if extraction raises.
        doc.close()

    return chunks_data
124
+
125
+
126
+ # --- Endpoints ---
127
@app.get("/")
def health_check():
    """Liveness probe for the backend."""
    payload = {"status": "ok", "service": "ClauseWatch AI Backend"}
    return payload
130
+
131
+
132
@app.post("/api/v1/analyze", response_model=ContractAnalysisResponse)
async def analyze_contract(file: UploadFile = File(...), db: Session = Depends(get_db)):
    """Analyze an uploaded PDF contract.

    Pipeline: magic-byte + extension validation -> text extraction ->
    language detection -> per-clause NLP risk scoring -> SQL history row ->
    vector-store indexing (best-effort). Returns the aggregate analysis.
    """
    # Magic-bytes check for security: a real PDF starts with "%PDF".
    header = await file.read(4)
    await file.seek(0)

    if header != b'%PDF':
        raise HTTPException(
            status_code=400,
            detail="Security Alert: File is not a valid PDF (Invalid Magic Bytes)."
        )

    # 1. Extension validation.
    if not file.filename.endswith(".pdf"):
        raise HTTPException(
            status_code=400, detail="Invalid file type. Only PDF allowed."
        )

    try:
        content = await file.read()
        chunks_with_meta = extract_text_with_metadata(content)

        if not chunks_with_meta:
            raise HTTPException(
                status_code=400, detail="No text found in PDF. Is it scanned?"
            )

        # Detect language from the first 5 chunks; default to Spanish.
        full_text_sample = " ".join([c["text"] for c in chunks_with_meta[:5]])
        detected_lang = "es"
        try:
            detected_lang = detect(full_text_sample)
        except Exception:
            # langdetect raises on very short/ambiguous samples; keep default.
            pass

        # 2. NLP analysis (risk detection), capped at 100 clauses for latency.
        analyzed_clauses = []
        risky_count = 0

        for item in chunks_with_meta[:100]:
            result = nlp_engine.analyze_clause(item["text"])

            if result:
                analyzed_clauses.append(result)
                if result["is_risky"]:
                    risky_count += 1

        # Risk score = percentage of analyzed clauses flagged risky.
        total = len(analyzed_clauses)
        risk_score = int((risky_count / total) * 100) if total > 0 else 0

        # 3. Persistence layer A: SQL history record.
        db_record = AnalysisRecord(
            filename=file.filename,
            risk_score=risk_score,
            total_clauses=total,
            risky_clauses=risky_count,
        )
        db.add(db_record)
        db.commit()
        db.refresh(db_record)

        # 4. Persistence layer B: vector store (RAG context). Failures here
        # are non-fatal so the analysis response is still returned.
        try:
            vector_db.add_contract(file.filename, chunks_with_meta)
            print(f"Indexation complete for {file.filename}")
        except Exception as vec_error:
            print(f"Vector DB Error (Non-blocking): {vec_error}")

        return ContractAnalysisResponse(
            filename=file.filename,
            language=detected_lang,
            risk_score=risk_score,
            total_clauses_analyzed=total,
            risky_clauses_count=risky_count,
            details=analyzed_clauses,
        )

    except HTTPException:
        # BUGFIX: the deliberate 400 raised above ("No text found...") was
        # previously caught by the broad handler below and converted into a
        # generic 500; re-raise it untouched.
        raise
    except Exception as e:
        print(f"Error processing file: {e}")
        raise HTTPException(status_code=500, detail=str(e))
218
+
219
+
220
@app.get("/api/v1/history")
def get_history(db: Session = Depends(get_db)):
    """Return the 10 most recent analysis records, newest first."""
    query = db.query(AnalysisRecord)
    query = query.order_by(AnalysisRecord.upload_date.desc())
    return query.limit(10).all()
229
+
230
+
231
@app.post("/api/v1/search", response_model=SearchResponse)
def search_contract(search_data: SearchQuery):
    """Semantic search inside a previously indexed contract."""
    final_query = search_data.query

    # --- Translation Logic (User Language -> Doc Language) ---
    # Translate the query into the document's language when they differ, so
    # embeddings are compared within one language.
    try:
        query_lang = detect(search_data.query)
        if query_lang != search_data.doc_language:
            translator = GoogleTranslator(
                source="auto", target=search_data.doc_language
            )
            final_query = translator.translate(search_data.query)
    except Exception as e:
        print(f"Translation warning: {e}")
    # ---------------------------------------------------------

    print(f"SEARCHING: '{final_query}' in file: '{search_data.filename}'")

    try:
        results = vector_db.search_similar(
            final_query, filename=search_data.filename, n_results=search_data.top_k
        )

        formatted_results = []
        seen_texts = set()

        if results and results["documents"]:
            # Walk the three parallel result lists in lockstep.
            hits = zip(
                results["documents"][0],
                results["metadatas"][0],
                results["distances"][0],
            )

            for text_content, meta, dist in hits:
                # Deduplicate identical chunks.
                if text_content in seen_texts:
                    continue
                seen_texts.add(text_content)

                formatted_results.append(
                    {
                        "text": text_content,
                        "metadata": meta,
                        "similarity_score": 1 - dist,
                    }
                )

        return SearchResponse(results=formatted_results)

    except Exception as e:
        print(f"Search Error: {e}")
        raise HTTPException(status_code=500, detail=str(e))
286
+
287
+
288
@app.post("/api/v1/explain")
def explain_clause(request: ExplainRequest):
    """Ask Gemini for a plain-language explanation of a clause, optionally
    steered toward answering a specific user question."""
    text_snippet = request.text
    user_question = request.query

    print(f"Gemini explaining: {text_snippet[:30]}... (Context: {user_question})")

    # --- DYNAMIC PROMPT CONSTRUCTION ---
    # With a user question the model is told to answer it; otherwise it
    # produces a generic simple-terms explanation.
    if user_question:
        context_instruction = f"The user has this specific question: '{user_question}'. YOUR MAIN GOAL IS TO ANSWER THIS QUESTION using the clause information."
    else:
        context_instruction = (
            "The user wants to understand what this legal clause means in simple terms."
        )

    prompt = f"""
    Act as an expert and friendly lawyer.
    You have a legal clause and a user question/intent.

    LEGAL TEXT: "{text_snippet}"

    INSTRUCTION: {context_instruction}

    Rules:
    1. Use a professional but approachable tone.
    2. Do not start with greetings or sign-offs.
    3. **CRITICAL: Respond in the same language as the user's question (or Spanish if the question is missing).**
    4. If you don't understand the clause, state it clearly.
    5. If the clause answers the question, state it clearly (e.g., "Yes, you can...", "No, because...").
    6. Explain the risk or obligation in simple terms for a general audience.
    7. Maximum 3 lines of output.
    """

    try:
        response = model.generate_content(prompt)
        explanation = response.text.strip()
    except Exception as e:
        # Degrade gracefully when the Gemini call fails.
        print(f"Gemini Error: {e}")
        explanation = (
            "Could not connect to AI Assistant. Please review the clause manually."
        )

    return {"explanation": explanation}
331
+
332
+
333
+ # uvicorn main:app --reload
requirements.txt ADDED
Binary file (334 Bytes). View file