Nilyzz committed on
Commit
306e475
·
1 Parent(s): 12091ef

Add files

Browse files
Dockerfile ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Minimal Python 3.10 image for the ClauseWatch FastAPI backend.
FROM python:3.10-slim

WORKDIR /app

# Copy the dependency manifest first so the pip layer is cached
# unless requirements.txt changes.
COPY requirements.txt .

RUN pip install --no-cache-dir -r requirements.txt

COPY . .

# Run as an unprivileged user; UID 1000 follows the Hugging Face Spaces convention.
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

# 7860 is the port Hugging Face Spaces expects the app to listen on.
EXPOSE 7860

CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
app/core/__pycache__/database.cpython-312.pyc ADDED
Binary file (870 Bytes). View file
 
app/core/database.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""SQLAlchemy engine/session setup for the SQLite persistence layer."""

from sqlalchemy import create_engine
# declarative_base moved to sqlalchemy.orm in SQLAlchemy 1.4; the
# sqlalchemy.ext.declarative import is deprecated.
from sqlalchemy.orm import declarative_base, sessionmaker

# SQLite database file in the project root.
SQLALCHEMY_DATABASE_URL = "sqlite:///./clausewatch.db"

# check_same_thread=False is required because FastAPI may service a request
# on a different thread than the one that opened the SQLite connection.
engine = create_engine(
    SQLALCHEMY_DATABASE_URL, connect_args={"check_same_thread": False}
)

SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)

Base = declarative_base()


def get_db():
    """FastAPI dependency: yield a DB session and always close it afterwards."""
    db = SessionLocal()
    try:
        yield db
    finally:
        db.close()
app/models/__pycache__/sql_models.cpython-312.pyc ADDED
Binary file (918 Bytes). View file
 
app/models/sql_models.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from sqlalchemy import Column, Integer, String, Float, DateTime
from datetime import datetime
from app.core.database import Base


class AnalysisRecord(Base):
    """One row per analyzed contract: summary metrics for the history endpoint."""

    __tablename__ = "analysis_history"

    id = Column(Integer, primary_key=True, index=True)
    # Original upload filename; indexed so history lookups by file are fast.
    filename = Column(String, index=True)
    # Server-side timestamp (naive UTC) set when the row is created.
    upload_date = Column(DateTime, default=datetime.utcnow)
    # 0-100: percentage of analyzed clauses flagged as risky.
    risk_score = Column(Integer)
    total_clauses = Column(Integer)
    risky_clauses = Column(Integer)
app/services/__pycache__/nlp_engine.cpython-312.pyc ADDED
Binary file (4.21 kB). View file
 
app/services/__pycache__/vector_store.cpython-312.pyc ADDED
Binary file (3.77 kB). View file
 
app/services/nlp_engine.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
3
+ import torch.nn.functional as F
4
+
5
class LegalNLPEngine:
    """Hybrid clause risk analyzer.

    Deterministic Spanish keyword heuristics decide clear-cut cases first;
    any remaining clause is scored by a Legal-BERT sequence classifier.
    """

    def __init__(self):
        self.model_name = "nlpaueb/legal-bert-base-uncased"
        # Prefer GPU when available; fall back to CPU.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        print(f"Loading NLP Model: {self.model_name} on {self.device}...")

        # 1. TOKENIZER: converts text to token ids.
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)

        # 2. MODEL: binary classification head on top of Legal-BERT.
        # NOTE(review): num_labels=2 on a base checkpoint means the head is
        # randomly initialized unless fine-tuned weights are loaded — confirm.
        self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name, num_labels=2)
        self.model.to(self.device)
        self.model.eval()

    def analyze_clause(self, text: str):
        """Classify a single clause.

        Returns a dict with text_snippet/label/confidence/is_risky, or None
        when the input is empty or shorter than 10 characters.
        """
        if not text or len(text) < 10:
            return None

        # --- Rule-based heuristics (checked before the model) ---
        text_lower = text.lower()

        risky_keywords = [
            "modificación unilateral", "exención total de responsabilidad",
            "venta de datos", "renuncia a derechos", "demandas colectivas",
            "arbitraje privado", "sin previo aviso", "no se hace responsable",
            "derecho irrevocable", "renunciando a la jurisdicción",
            "indemnización", "sin compensación", "datos a terceros"
        ]

        safe_keywords = [
            "horario", "jornada", "fecha", "nombre", "domicilio",
            "dni", "firmado", "en prueba", "convenio", "trabajador",
            "vacaciones", "nómina", "seguridad social", "protección de datos",
            "anexo", "contrato", "acuerdo", "estipulaciones", "cláusula",
            "firmando", "lugar y fecha", "reunidos"
        ]

        # Risky keywords win over safe ones: they are checked first.
        if any(k in text_lower for k in risky_keywords):
            return {
                "text_snippet": text[:100] + "...",
                "label": "POTENTIAL_RISK",
                "confidence": 0.95,
                "is_risky": True
            }

        if any(k in text_lower for k in safe_keywords):
            return {
                "text_snippet": text[:100] + "...",
                "label": "ACCEPTABLE",
                "confidence": 0.90,
                "is_risky": False
            }

        # --- Legal-BERT inference for clauses the rules did not decide ---
        try:
            # Tokenize (truncate to BERT's 512-token limit).
            inputs = self.tokenizer(
                text,
                return_tensors="pt",
                truncation=True,
                max_length=512,
                padding=True
            ).to(self.device)

            # Forward pass without gradient tracking.
            with torch.no_grad():
                outputs = self.model(**inputs)

            probs = F.softmax(outputs.logits, dim=1)

            # Probability assigned to class index 1 ("risky").
            risk_score = probs[0][1].item()

            is_risky_ai = risk_score > 0.55

            return {
                "text_snippet": text[:100] + "...",
                "label": "AI_DETECTED_RISK" if is_risky_ai else "AI_CLEARED",
                "confidence": round(float(max(probs[0])), 2),
                "is_risky": is_risky_ai
            }

        except Exception as e:
            # BUGFIX: the exception was silently swallowed (e unused); log it
            # so model failures are visible, then degrade to a neutral verdict.
            print(f"NLP inference failed, returning NEUTRAL: {e}")
            return {
                "text_snippet": text[:100] + "...",
                "label": "NEUTRAL",
                "confidence": 0.0,
                "is_risky": False
            }
96
+
97
# Module-level singleton: the model is loaded once at import time and shared
# by all request handlers.
nlp_engine = LegalNLPEngine()
app/services/vector_store.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import google.generativeai as genai
2
+ import numpy as np
3
+ import os
4
+
5
class InMemoryVectorStore:
    """Per-filename in-memory vector index backed by the Google embedding API."""

    def __init__(self):
        # filename -> list of {"text", "vector", "metadata"} entries.
        self.store = {}

        self.model_name = "models/text-embedding-004"

    def get_embedding(self, text):
        """Embed one document chunk; returns [] on any API failure."""
        try:
            result = genai.embed_content(
                model=self.model_name,
                content=text,
                task_type="retrieval_document"
            )
            return result['embedding']
        except Exception as e:
            print(f"Error getting embedding: {e}")
            return []

    def add_contract(self, filename: str, chunks: list):
        """Index a contract's chunks under its filename (replaces any prior index)."""
        print(f"Indexing {filename} using Google Embeddings...")

        self.store[filename] = []

        for chunk in chunks:
            text = chunk["text"]
            vector = self.get_embedding(text)

            # Skip chunks whose embedding failed ([] is falsy).
            if vector:
                self.store[filename].append({
                    "text": text,
                    "vector": np.array(vector),
                    "metadata": {"page": chunk["page"]}
                })

        print(f"Indexed {len(self.store[filename])} chunks for {filename}")

    def search_similar(self, query: str, filename: str, n_results: int = 3):
        """Cosine-similarity search; returns a Chroma-style result dict."""
        empty = {"documents": [[]], "metadatas": [[]], "distances": [[]]}
        if filename not in self.store:
            return empty

        try:
            query_emb = genai.embed_content(
                model=self.model_name,
                content=query,
                task_type="retrieval_query"
            )['embedding']
            query_vec = np.array(query_emb)
        except Exception as e:
            # BUGFIX: was a bare `except:` (also swallowed KeyboardInterrupt);
            # narrowed and logged.
            print(f"Error embedding query: {e}")
            return empty

        scores = []
        query_norm = np.linalg.norm(query_vec)
        for item in self.store[filename]:
            doc_vec = item["vector"]
            denom = query_norm * np.linalg.norm(doc_vec)
            # Guard against zero-norm vectors, which would otherwise yield NaN
            # and corrupt the ranking.
            score = float(np.dot(query_vec, doc_vec) / denom) if denom else 0.0
            scores.append((score, item))

        scores.sort(key=lambda x: x[0], reverse=True)
        top_results = scores[:n_results]

        # Distance = 1 - cosine similarity, matching the Chroma convention.
        return {
            "documents": [[res[1]["text"] for res in top_results]],
            "metadatas": [[res[1]["metadata"] for res in top_results]],
            "distances": [[1 - res[0] for res in top_results]]
        }
69
+
70
# Global singleton instance shared by the API endpoints.
vector_db = InMemoryVectorStore()
clausewatch.db ADDED
Binary file (16.4 kB). View file
 
main.py ADDED
@@ -0,0 +1,333 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fitz
2
+ import os
3
+ import google.generativeai as genai
4
+ from fastapi import FastAPI, UploadFile, File, HTTPException, Depends
5
+ from fastapi.middleware.cors import CORSMiddleware
6
+ from sqlalchemy.orm import Session
7
+ from pydantic import BaseModel
8
+ from typing import List, Optional
9
+ from deep_translator import GoogleTranslator
10
+ from langdetect import detect
11
+ from dotenv import load_dotenv
12
+ from app.services.nlp_engine import nlp_engine
13
+ from app.core.database import engine, Base, get_db
14
+ from app.models.sql_models import AnalysisRecord
15
+ from app.services.vector_store import vector_db
16
+
17
+
18
+ # --- CONFIGURATION ---
19
+ load_dotenv()
20
+
21
+ api_key = os.getenv("API_KEY_GEMINI")
22
+ if not api_key:
23
+ print("WARNING: API_KEY_GEMINI not found in .env file")
24
+ else:
25
+ genai.configure(api_key=api_key.strip())
26
+
27
+ model = genai.GenerativeModel("gemini-2.5-flash")
28
+
29
+ # Create database tables
30
+ Base.metadata.create_all(bind=engine)
31
+
32
+ app = FastAPI(
33
+ title="ClauseWatch AI API",
34
+ description="API for contract analysis using deterministic NLP and Hybrid Persistence.",
35
+ version="1.0.0",
36
+ )
37
+
38
+ # --- CORS CONFIGURATION ---
39
+ origins = [
40
+ "http://localhost:3000",
41
+ "http://127.0.0.1:3000",
42
+ "https://clause-watch-ia.vercel.app",
43
+ "https://clause-watch-ia.vercel.app/",
44
+ ]
45
+
46
+ app.add_middleware(
47
+ CORSMiddleware,
48
+ allow_origins=origins,
49
+ allow_credentials=True,
50
+ allow_methods=["*"],
51
+ allow_headers=["*"],
52
+ )
53
+
54
+
55
# --- Pydantic Models ---
class ClauseAnalysis(BaseModel):
    """Per-clause verdict produced by the NLP engine."""
    text_snippet: str
    label: str
    confidence: float
    is_risky: bool


class ContractAnalysisResponse(BaseModel):
    """Aggregate analysis returned by POST /api/v1/analyze."""
    filename: str
    language: str
    risk_score: int
    total_clauses_analyzed: int
    risky_clauses_count: int
    details: List[ClauseAnalysis]


class SearchQuery(BaseModel):
    """Request body for POST /api/v1/search."""
    query: str
    filename: str
    # Language the document was indexed in; the query is translated into it.
    doc_language: str = "es"
    top_k: int = 3


class SearchResultItem(BaseModel):
    """One semantic-search hit."""
    text: str
    similarity_score: float
    metadata: dict


class SearchResponse(BaseModel):
    """Envelope for semantic-search results."""
    results: List[SearchResultItem]


class ExplainRequest(BaseModel):
    """Request body for POST /api/v1/explain; query is an optional user question."""
    text: str
    query: Optional[str] = None
92
+
93
+
94
+ # --- Helper Functions ---
95
def extract_text_with_metadata(file_content: bytes) -> List[dict]:
    """Extract text chunks (with 1-based page numbers) from a PDF byte stream.

    Blocks longer than 300 chars are split into sentences on ". "; blocks
    under ~50 chars and sentences under ~30 chars are discarded as noise.
    Returns a list of {"text": str, "page": int} dicts.
    """
    doc = fitz.open(stream=file_content, filetype="pdf")
    chunks_data = []

    try:
        for page_num, page in enumerate(doc):
            blocks = page.get_text("blocks")

            for block in blocks:
                # block[4] is the text payload of a fitz "blocks" tuple.
                text_block = block[4].strip()

                # Collapse intra-block line breaks into single spaces.
                clean_text = " ".join(text_block.splitlines())

                if len(clean_text) > 50:
                    # Split overly long blocks by sentences.
                    if len(clean_text) > 300:
                        sentences = clean_text.split(". ")
                        for sentence in sentences:
                            if len(sentence) > 30:
                                # Normalize trailing punctuation to one period.
                                final_sent = sentence.strip().rstrip(".") + "."

                                chunks_data.append(
                                    {"text": final_sent, "page": page_num + 1}
                                )
                    else:
                        final_text = clean_text.strip().rstrip(".") + "."
                        chunks_data.append({"text": final_text, "page": page_num + 1})
    finally:
        # BUGFIX: the document was never closed; fitz holds native resources,
        # so release them even if extraction raises.
        doc.close()

    return chunks_data
124
+
125
+
126
+ # --- Endpoints ---
127
@app.get("/")
def health_check():
    """Liveness probe for the backend."""
    payload = {"status": "ok", "service": "ClauseWatch AI Backend"}
    return payload
130
+
131
+
132
@app.post("/api/v1/analyze", response_model=ContractAnalysisResponse)
async def analyze_contract(file: UploadFile = File(...), db: Session = Depends(get_db)):
    """Analyze an uploaded PDF contract.

    Pipeline: magic-byte + extension validation -> text extraction ->
    language detection -> per-clause NLP risk scoring -> SQL history row ->
    vector-store indexing (best-effort). Returns the aggregate analysis.
    """
    # Magic-bytes check for security: a real PDF starts with "%PDF".
    header = await file.read(4)
    await file.seek(0)

    if header != b'%PDF':
        raise HTTPException(
            status_code=400,
            detail="Security Alert: File is not a valid PDF (Invalid Magic Bytes)."
        )

    # 1. Extension validation.
    if not file.filename.endswith(".pdf"):
        raise HTTPException(
            status_code=400, detail="Invalid file type. Only PDF allowed."
        )

    try:
        content = await file.read()
        chunks_with_meta = extract_text_with_metadata(content)

        if not chunks_with_meta:
            raise HTTPException(
                status_code=400, detail="No text found in PDF. Is it scanned?"
            )

        # Detect language from the first 5 chunks; default to Spanish.
        full_text_sample = " ".join([c["text"] for c in chunks_with_meta[:5]])
        detected_lang = "es"
        try:
            detected_lang = detect(full_text_sample)
        except Exception:
            # langdetect raises on very short/ambiguous samples; keep default.
            pass

        # 2. NLP analysis (risk detection), capped at 100 clauses for latency.
        analyzed_clauses = []
        risky_count = 0

        for item in chunks_with_meta[:100]:
            result = nlp_engine.analyze_clause(item["text"])

            if result:
                analyzed_clauses.append(result)
                if result["is_risky"]:
                    risky_count += 1

        # Risk score = percentage of analyzed clauses flagged risky.
        total = len(analyzed_clauses)
        risk_score = int((risky_count / total) * 100) if total > 0 else 0

        # 3. Persistence layer A: SQL history record.
        db_record = AnalysisRecord(
            filename=file.filename,
            risk_score=risk_score,
            total_clauses=total,
            risky_clauses=risky_count,
        )
        db.add(db_record)
        db.commit()
        db.refresh(db_record)

        # 4. Persistence layer B: vector store (RAG context). Failures here
        # are non-fatal so the analysis response is still returned.
        try:
            vector_db.add_contract(file.filename, chunks_with_meta)
            print(f"Indexation complete for {file.filename}")
        except Exception as vec_error:
            print(f"Vector DB Error (Non-blocking): {vec_error}")

        return ContractAnalysisResponse(
            filename=file.filename,
            language=detected_lang,
            risk_score=risk_score,
            total_clauses_analyzed=total,
            risky_clauses_count=risky_count,
            details=analyzed_clauses,
        )

    except HTTPException:
        # BUGFIX: the deliberate 400 raised above ("No text found...") was
        # previously caught by the broad handler below and converted into a
        # generic 500; re-raise it untouched.
        raise
    except Exception as e:
        print(f"Error processing file: {e}")
        raise HTTPException(status_code=500, detail=str(e))
218
+
219
+
220
@app.get("/api/v1/history")
def get_history(db: Session = Depends(get_db)):
    """Return the 10 most recent analysis records, newest first."""
    query = db.query(AnalysisRecord)
    query = query.order_by(AnalysisRecord.upload_date.desc())
    return query.limit(10).all()
229
+
230
+
231
@app.post("/api/v1/search", response_model=SearchResponse)
def search_contract(search_data: SearchQuery):
    """Semantic search inside a previously indexed contract."""
    final_query = search_data.query

    # --- Translation Logic (User Language -> Doc Language) ---
    # Translate the query into the document's language when they differ, so
    # embeddings are compared within one language.
    try:
        query_lang = detect(search_data.query)
        if query_lang != search_data.doc_language:
            translator = GoogleTranslator(
                source="auto", target=search_data.doc_language
            )
            final_query = translator.translate(search_data.query)
    except Exception as e:
        print(f"Translation warning: {e}")
    # ---------------------------------------------------------

    print(f"SEARCHING: '{final_query}' in file: '{search_data.filename}'")

    try:
        results = vector_db.search_similar(
            final_query, filename=search_data.filename, n_results=search_data.top_k
        )

        formatted_results = []
        seen_texts = set()

        if results and results["documents"]:
            # Walk the three parallel result lists in lockstep.
            hits = zip(
                results["documents"][0],
                results["metadatas"][0],
                results["distances"][0],
            )

            for text_content, meta, dist in hits:
                # Deduplicate identical chunks.
                if text_content in seen_texts:
                    continue
                seen_texts.add(text_content)

                formatted_results.append(
                    {
                        "text": text_content,
                        "metadata": meta,
                        "similarity_score": 1 - dist,
                    }
                )

        return SearchResponse(results=formatted_results)

    except Exception as e:
        print(f"Search Error: {e}")
        raise HTTPException(status_code=500, detail=str(e))
286
+
287
+
288
@app.post("/api/v1/explain")
def explain_clause(request: ExplainRequest):
    """Ask Gemini for a plain-language explanation of a clause, optionally
    steered toward answering a specific user question."""
    text_snippet = request.text
    user_question = request.query

    print(f"Gemini explaining: {text_snippet[:30]}... (Context: {user_question})")

    # --- DYNAMIC PROMPT CONSTRUCTION ---
    # With a user question the model is told to answer it; otherwise it
    # produces a generic simple-terms explanation.
    if user_question:
        context_instruction = f"The user has this specific question: '{user_question}'. YOUR MAIN GOAL IS TO ANSWER THIS QUESTION using the clause information."
    else:
        context_instruction = (
            "The user wants to understand what this legal clause means in simple terms."
        )

    prompt = f"""
    Act as an expert and friendly lawyer.
    You have a legal clause and a user question/intent.

    LEGAL TEXT: "{text_snippet}"

    INSTRUCTION: {context_instruction}

    Rules:
    1. Use a professional but approachable tone.
    2. Do not start with greetings or sign-offs.
    3. **CRITICAL: Respond in the same language as the user's question (or Spanish if the question is missing).**
    4. If you don't understand the clause, state it clearly.
    5. If the clause answers the question, state it clearly (e.g., "Yes, you can...", "No, because...").
    6. Explain the risk or obligation in simple terms for a general audience.
    7. Maximum 3 lines of output.
    """

    try:
        response = model.generate_content(prompt)
        explanation = response.text.strip()
    except Exception as e:
        # Degrade gracefully when the Gemini call fails.
        print(f"Gemini Error: {e}")
        explanation = (
            "Could not connect to AI Assistant. Please review the clause manually."
        )

    return {"explanation": explanation}
331
+
332
+
333
+ # uvicorn main:app --reload
requirements.txt ADDED
Binary file (334 Bytes). View file