# NOTE: the lines "Spaces: / Sleeping / Sleeping" were a HuggingFace Spaces
# status banner captured by the page scrape — not part of the source file.
| from pydantic import BaseModel | |
| from fastapi import FastAPI, HTTPException | |
| from typing import Annotated, Literal | |
| from pydantic import Field | |
| from fastapi.responses import JSONResponse | |
| import numpy as np | |
| from transformers import pipeline | |
| from transformers import AutoTokenizer, AutoModelForSeq2SeqLM | |
| import torch | |
| from transformers import AutoTokenizer, AutoModelForSequenceClassification | |
| from transformers import AutoConfig | |
| import torch.nn.functional as F | |
| from fastapi.middleware.cors import CORSMiddleware | |
| from fastapi import UploadFile, File | |
| import fitz | |
| from transformers import AutoModelForSequenceClassification, AutoModelForQuestionAnswering | |
# Module-level cache holding the text of the most recently uploaded PDF.
# NOTE(review): this is single-slot global state — concurrent users overwrite
# each other's document; confirm that is acceptable for the deployment.
pdf_cache = {"text": None}
def pdfopen(filepath: str) -> str:
    """Extract the plain text of every page of a PDF.

    Args:
        filepath: Path to a PDF file readable by PyMuPDF (``fitz``).

    Returns:
        The concatenated text of all pages, stripped of surrounding
        whitespace. Empty string for image-only/scanned PDFs.
    """
    doc = fitz.open(filepath)
    try:
        # "".join(...) builds the text in one pass instead of the original
        # quadratic `text = text + page.get_text()` concatenation loop.
        return "".join(page.get_text() for page in doc).strip()
    finally:
        # Close even when extraction raises, so the file handle is not leaked
        # (the original skipped close() on any exception).
        doc.close()
def clean_short(ans):
    """Trim an answer down to at most its first three whitespace-separated words."""
    first_three = ans.split()[:3]
    return " ".join(first_three)
def summarizer():
    """Return the legal-summarisation pipeline, loading it at most once.

    The original re-created the HuggingFace pipeline (model download +
    weight load) on every request; the instance is now memoised on the
    function object so repeated calls are cheap.

    Returns:
        A ``transformers`` summarization pipeline for
        ``krrishsinha/legal_summariser``.
    """
    if getattr(summarizer, "_pipe", None) is None:
        summarizer._pipe = pipeline("summarization", model="krrishsinha/legal_summariser")
    return summarizer._pipe
def anq():
    """Return ``(tokenizer, model)`` for FLAN-T5 question answering.

    The original reloaded ``google/flan-t5-large`` (several GB) on every
    call; the pair is now loaded once and memoised on the function object.

    Returns:
        Tuple of (AutoTokenizer, AutoModelForSeq2SeqLM).
    """
    cached = getattr(anq, "_cached", None)
    if cached is None:
        model_name = "google/flan-t5-large"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        cached = (tokenizer, model)
        anq._cached = cached
    return cached
def clause(sen):
    """Classify *sen* into a clause category with the fine-tuned classifier.

    Args:
        sen: Clause text (str, or list of str for a batch — the tokenizer
            pads/truncates either way).

    Returns:
        The human-readable predicted label from the model config, or
        ``"LABEL_<id>"`` when the config has no mapping for the id.
    """
    # Load tokenizer/model/config once and memoise — the original re-read
    # all three from the hub on every single prediction.
    cached = getattr(clause, "_cached", None)
    if cached is None:
        repo = "krrishsinha/clausedetectionfinal"
        cached = (
            AutoTokenizer.from_pretrained(repo),
            AutoModelForSequenceClassification.from_pretrained(repo),
            AutoConfig.from_pretrained(repo),
        )
        clause._cached = cached
    tokenizer, model, config = cached
    inputs = tokenizer(sen, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        logits = model(**inputs).logits
    # (Dropped the original's softmax over logits — the probabilities were
    # computed and then never used.)
    pred_id = int(torch.argmax(logits, dim=1).item())
    return config.id2label.get(pred_id, f"LABEL_{pred_id}")
class summariser(BaseModel):
    """Request body for the summarisation endpoint."""
    # NOTE(review): this field is never read — summary() pulls text from
    # pdf_cache instead; confirm whether the body is still needed.
    pdf : Annotated[str, Field(..., description = "here goes your pdf")]
class qna(BaseModel):
    """Request body for the question-answering endpoint."""
    # The question asked against the cached PDF text.
    question : Annotated[str, Field(..., description = "here goes your question regarding the document you want to ask")]
class clausedetection(BaseModel):
    """Request body for the clause-detection endpoint."""
    # Text to classify; handlers fall back to the cached PDF text when falsy.
    text : Annotated[str, Field(..., description = "here goes your text for detecting its clause")]
# Application instance plus permissive CORS so a browser front-end can call it.
app = FastAPI()
# NOTE(review): wildcard origins combined with allow_credentials=True is
# disallowed by the CORS spec and over-permissive for production — restrict
# allow_origins to the actual front-end domain(s).
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
def welcome():
    """Greeting payload for the API root.

    NOTE(review): the route decorator (presumably ``@app.get("/")``) is
    missing in this copy of the file — confirm against the repository.
    """
    greeting = {"welcome to Lawlytics": "AI Corporate Legal Document Intelligence"}
    return greeting
async def uploading(file: UploadFile = File(...)):
    """Receive a PDF upload, extract its text, and cache it for later calls.

    Args:
        file: The uploaded PDF.

    Returns:
        Dict with a success message and the number of characters extracted.

    Raises:
        HTTPException: 400 when the PDF yields no text (e.g. scanned),
            500 for any other failure.
    """
    import os
    import tempfile
    try:
        content = await file.read()
        # Write to a temp file instead of ./{file.filename}: the client-chosen
        # filename allowed path traversal and the file was never cleaned up.
        fd, tmp_path = tempfile.mkstemp(suffix=".pdf")
        try:
            with os.fdopen(fd, "wb") as f:
                f.write(content)
            t = pdfopen(tmp_path)
        finally:
            os.remove(tmp_path)
        if not t:
            raise HTTPException(status_code=400, detail="No text found in PDF. Maybe it's scanned?")
        pdf_cache["text"] = t
        return {"message": "PDF uploaded & text extracted successfully", "characters_extracted": len(t)}
    except HTTPException:
        # The original's broad handler swallowed the 400 above and re-raised
        # it as a 500; let deliberate HTTP errors pass through unchanged.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
def summary():
    """Summarise the most recently uploaded PDF's cached text.

    Returns:
        Dict with the summarisation pipeline's output under ``"summary"``.

    Raises:
        HTTPException: 400 when no PDF text has been uploaded yet.
    """
    document_text = pdf_cache["text"]
    if not document_text:
        raise HTTPException(status_code=400, detail="No PDF text found. Upload PDF first.")
    summarize = summarizer()
    result = summarize(document_text, max_length=100, min_length=30, do_sample=False)
    return {"summary": result}
def quesans(py: qna):
    """Answer a short question about the cached PDF text using FLAN-T5.

    Args:
        py: Request body carrying the question.

    Returns:
        Dict with the (at most three-word) answer under ``"answer"``.

    Raises:
        HTTPException: 400 when no PDF text has been uploaded yet.
    """
    txt2 = pdf_cache["text"]
    if not txt2:
        raise HTTPException(
            status_code=400,
            detail="No PDF text found. Upload PDF first."
        )
    tokenizer, model = anq()
    # FLAN-T5 prompt
    prompt = (
        f"Provide the answer in only 1 to 3 words.\n"
        f"Question: {py.question}\n"
        f"Context: {txt2}\n"
        f"Answer:"
    )
    # Truncate to the encoder's window: a full legal document easily exceeds
    # the model's maximum input length, which the original did not guard.
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    outputs = model.generate(
        **inputs,
        max_length=20,
        num_beams=5,
        early_stopping=True
    )
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Final small cleanup: enforce the 1-to-3-word answer format.
    answer = clean_short(answer)
    return {"answer": answer}
def clausing(l: clausedetection):
    """Detect the clause category of the supplied text.

    Falls back to the cached PDF text when the request's text is empty.

    Raises:
        HTTPException: 400 when neither text nor a cached PDF is available.
    """
    candidate = l.text or pdf_cache["text"]
    if not candidate:
        raise HTTPException(status_code=400, detail="Provide text or upload PDF first.")
    label = clause(sen=candidate)
    return {"detected clause": label}