Spaces:

tejash300
/

docanalyzer

Runtime error

App Files Files Community

tejash300 committed on Apr 4

Commit

6bbd6b4

verified ·

1 Parent(s): deeb866

Update app.py

Browse files

Files changed (1) hide show

app.py +58 -14

app.py CHANGED Viewed

@@ -13,7 +13,7 @@ import numpy as np
 import json
 import tempfile
 from fastapi import FastAPI, UploadFile, File, HTTPException, Form, BackgroundTasks
-from fastapi.responses import FileResponse, JSONResponse, HTMLResponse  # Added HTMLResponse
 from fastapi.middleware.cors import CORSMiddleware
 from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer
 from sentence_transformers import SentenceTransformer
@@ -31,6 +31,9 @@ from starlette.concurrency import run_in_threadpool
 import gensim
 from gensim import corpora, models
 # Global cache for analysis results based on file hash
 analysis_cache = {}
@@ -197,15 +200,13 @@ try:
         nlp = spacy.load("en_core_web_sm")
     print("✅ Loading NLP models...")
-    # Update summarizer to use the LED model for long-document summarization
-    from transformers import LEDTokenizer
     summarizer = pipeline(
         "summarization",
-        model="allenai/led-large-16384",
-        tokenizer="allenai/led-large-16384",
         device=0 if torch.cuda.is_available() else -1
     )
-    # Optionally convert summarizer model to FP16 for faster inference on GPU (if supported)
     if device == "cuda":
         try:
             summarizer.model.half()
@@ -235,8 +236,6 @@ except Exception as e:
 from transformers import pipeline
 qa_model = pipeline("question-answering", model="deepset/roberta-base-squad2")
-# Initialize sentiment-analysis pipeline
 sentiment_pipeline = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english", device=0 if torch.cuda.is_available() else -1)
 def legal_chatbot(user_input, context):
@@ -263,10 +262,8 @@ async def process_video_to_text(video_file_path):
             "-acodec", "pcm_s16le", "-ar", "44100", "-ac", "2",
             temp_audio_path, "-y"
         ]
-        # Run ffmpeg in a separate thread
         await run_in_threadpool(subprocess.run, cmd, check=True)
         print(f"Audio extracted to {temp_audio_path}")
-        # Run speech-to-text in threadpool
         result = await run_in_threadpool(speech_to_text, temp_audio_path)
         transcript = result["text"]
         print(f"Transcription completed: {len(transcript)} characters")
@@ -326,11 +323,61 @@ def get_enhanced_context_info(text):
     enhanced["topics"] = analyze_topics(text, num_topics=5)
     return enhanced
 def analyze_risk_enhanced(text):
     enhanced = get_enhanced_context_info(text)
     avg_sentiment = enhanced["average_sentiment"]
     risk_score = abs(avg_sentiment) if avg_sentiment < 0 else 0
-    return {"risk_score": risk_score, "average_sentiment": avg_sentiment, "topics": enhanced["topics"]}
 def analyze_contract_clauses(text):
     max_length = 512
@@ -370,7 +417,6 @@ async def analyze_legal_document(file: UploadFile = File(...)):
     try:
         content = await file.read()
         file_hash = compute_md5(content)
-        # Return cached result if available
         if file_hash in analysis_cache:
             return analysis_cache[file_hash]
         text = await run_in_threadpool(extract_text_from_pdf, io.BytesIO(content))
@@ -594,10 +640,8 @@ async def download_clause_radar_chart(task_id: str):
         clauses = analyze_contract_clauses(text)
         if not clauses:
             raise HTTPException(status_code=404, detail="No clauses detected.")
-        # For radar chart, use clause types and their confidence scores
         labels = [c["type"] for c in clauses]
         values = [c["confidence"] for c in clauses]
-        # To close the radar chart, repeat the first value and label
         labels += labels[:1]
         values += values[:1]
         angles = np.linspace(0, 2 * np.pi, len(labels), endpoint=False).tolist()

 import json
 import tempfile
 from fastapi import FastAPI, UploadFile, File, HTTPException, Form, BackgroundTasks
+from fastapi.responses import FileResponse, JSONResponse, HTMLResponse
 from fastapi.middleware.cors import CORSMiddleware
 from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer
 from sentence_transformers import SentenceTransformer
 import gensim
 from gensim import corpora, models
+# Import spacy stop words
+from spacy.lang.en.stop_words import STOP_WORDS
 # Global cache for analysis results based on file hash
 analysis_cache = {}
         nlp = spacy.load("en_core_web_sm")
     print("✅ Loading NLP models...")
+    # Update summarizer to use facebook/bart-large-cnn for summarization
     summarizer = pipeline(
         "summarization",
+        model="facebook/bart-large-cnn",
+        tokenizer="facebook/bart-large-cnn",
         device=0 if torch.cuda.is_available() else -1
     )
     if device == "cuda":
         try:
             summarizer.model.half()
 from transformers import pipeline
 qa_model = pipeline("question-answering", model="deepset/roberta-base-squad2")
 sentiment_pipeline = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english", device=0 if torch.cuda.is_available() else -1)
 def legal_chatbot(user_input, context):
             "-acodec", "pcm_s16le", "-ar", "44100", "-ac", "2",
             temp_audio_path, "-y"
         ]
         await run_in_threadpool(subprocess.run, cmd, check=True)
         print(f"Audio extracted to {temp_audio_path}")
         result = await run_in_threadpool(speech_to_text, temp_audio_path)
         transcript = result["text"]
         print(f"Transcription completed: {len(transcript)} characters")
     enhanced["topics"] = analyze_topics(text, num_topics=5)
     return enhanced
+# New function to create a detailed, dynamic explanation for each topic
def explain_topics(topics):
    """Build a plain-language label and explanation for each topic.

    Args:
        topics: Iterable of ``(topic_idx, topic_str)`` pairs. Each
            ``topic_str`` is a ``+``-separated list of ``weight*"word"``
            terms (presumably the gensim ``print_topics`` format returned by
            ``analyze_topics`` -- TODO confirm against that helper).
            ``None`` is treated as "no topics".

    Returns:
        dict keyed by ``topic_idx``. Each value is a dict with:
            ``"label"``: one of the fixed risk-category labels,
            ``"explanation"``: a sentence listing up to five top terms,
            ``"terms"``: every kept ``(weight, word)`` tuple, sorted by
            descending weight.
    """
    if topics is None:
        return {}

    # Checked in this order: the first keyword found in any kept term wins.
    label_rules = (
        ("liability", "Liability & Penalty Risk"),
        ("termination", "Termination & Refund Risk"),
        ("compliance", "Compliance & Regulatory Risk"),
    )
    default_label = "General Risk Language"

    explanation = {}
    for topic_idx, topic_str in topics:
        terms = []
        for part in topic_str.split('+'):
            part = part.strip()
            if '*' not in part:
                # Not a weighted term; nothing to extract.
                continue
            weight_str, word = part.split('*', 1)
            # Strip both quote characters in one pass so mixed quoting
            # (e.g. '"word"') does not leave stray quotes behind.
            word = word.strip().strip("\"'")
            try:
                weight = float(weight_str)
            except ValueError:
                # Unparseable weight: keep the term, rank it as weight 0.
                weight = 0.0
            # Drop single characters and common stop words.
            if len(word) > 1 and word.lower() not in STOP_WORDS:
                terms.append((weight, word))
        terms.sort(key=lambda term: -term[0])

        lowered_words = [word.lower() for _, word in terms]
        label = next(
            (
                name
                for keyword, name in label_rules
                if any(keyword in candidate for candidate in lowered_words)
            ),
            default_label,
        )

        explanation_text = (
            f"Topic {topic_idx} ({label}) is characterized by dominant terms: "
            + ", ".join(f"'{word}' ({weight:.3f})" for weight, word in terms[:5])
        )
        explanation[topic_idx] = {
            "label": label,
            "explanation": explanation_text,
            "terms": terms,
        }
    return explanation
 def analyze_risk_enhanced(text):
     """Summarize the risk signals found in ``text``.

     Calls ``get_enhanced_context_info`` and derives a risk score from its
     ``"average_sentiment"`` value: the magnitude of a negative sentiment,
     or 0 when the sentiment is not negative. The raw topics are returned
     alongside the output of ``explain_topics`` for them.

     Returns:
         dict with keys ``"risk_score"``, ``"average_sentiment"``,
         ``"topics"`` and ``"topics_explanation"``.
     """
     context = get_enhanced_context_info(text)
     sentiment = context["average_sentiment"]
     # Only negative sentiment contributes to the risk score.
     if sentiment < 0:
         score = abs(sentiment)
     else:
         score = 0
     raw_topics = context["topics"]
     return {
         "risk_score": score,
         "average_sentiment": sentiment,
         "topics": raw_topics,
         "topics_explanation": explain_topics(raw_topics),
     }
 def analyze_contract_clauses(text):
     max_length = 512
     try:
         content = await file.read()
         file_hash = compute_md5(content)
         if file_hash in analysis_cache:
             return analysis_cache[file_hash]
         text = await run_in_threadpool(extract_text_from_pdf, io.BytesIO(content))
         clauses = analyze_contract_clauses(text)
         if not clauses:
             raise HTTPException(status_code=404, detail="No clauses detected.")
         labels = [c["type"] for c in clauses]
         values = [c["confidence"] for c in clauses]
         labels += labels[:1]
         values += values[:1]
         angles = np.linspace(0, 2 * np.pi, len(labels), endpoint=False).tolist()