Spaces:

tejash300
/

docanalyzer

Runtime error

App Files Files Community

tejash300 committed on Apr 6, 2025

Commit

74d93ff

verified ·

1 Parent(s): 5916467

Update app.py

Browse files

Files changed (1) hide show

app.py +17 -14

app.py CHANGED Viewed

@@ -45,7 +45,7 @@ except Exception:
 os.makedirs("static", exist_ok=True)
 os.makedirs("temp", exist_ok=True)
-# Ensure GPU usage
 device = "cuda" if torch.cuda.is_available() else "cpu"
 # Initialize FastAPI
@@ -64,16 +64,13 @@ app.add_middleware(
 document_storage = {}
 chat_history = []
-# Function to store document context by task ID
 def store_document_context(task_id, text):
     """Record ``text`` in ``document_storage`` under ``task_id``.

     Any value already stored for the same ``task_id`` is overwritten.

     Returns:
         Always ``True``.
     """
     document_storage.update({task_id: text})
     return True
-# Function to load document context by task ID
 def load_document_context(task_id):
     """Look up the value stored in ``document_storage`` for ``task_id``.

     Returns:
         The stored value, or an empty string when ``task_id`` has no entry.
     """
     if task_id in document_storage:
         return document_storage[task_id]
     return ""
-# Utility to compute MD5 hash from file content
 def compute_md5(content: bytes) -> str:
     """Return the MD5 digest of ``content`` as a hexadecimal string."""
     hasher = hashlib.md5()
     hasher.update(content)
     return hasher.hexdigest()
@@ -196,17 +193,14 @@ try:
         spacy.cli.download("en_core_web_sm")
         nlp = spacy.load("en_core_web_sm")
     print("✅ Loading NLP models...")
-    # Use Facebook's bart-large-cnn for summarization
     summarizer = pipeline(
         "summarization",
-        model="facebook/bart-large-cnn",
-        tokenizer="facebook/bart-large-cnn",
         device=0 if torch.cuda.is_available() else -1
     )
-    # Removed FP16 conversion for summarizer to avoid CUDA errors
-    # if device == "cuda":
-    #     summarizer.model.half()
     embedding_model = SentenceTransformer("all-mpnet-base-v2", device=device)
     ner_model = pipeline("ner", model="dslim/bert-base-NER", device=0 if torch.cuda.is_available() else -1)
     speech_to_text = pipeline("automatic-speech-recognition", model="openai/whisper-medium", chunk_length_s=30,
@@ -373,7 +367,10 @@ async def analyze_legal_document(file: UploadFile = File(...)):
         if not text:
             return {"status": "error", "message": "No valid text found in the document."}
         summary_text = text[:4096] if len(text) > 4096 else text
-        summary = summarizer(summary_text, max_length=200, min_length=50, do_sample=False)[0]['summary_text'] if len(text) > 100 else "Document too short for meaningful summarization."
         entities = extract_named_entities(text)
         risk_analysis = analyze_risk_enhanced(text)
         clauses = analyze_contract_clauses(text)
@@ -411,7 +408,10 @@ async def analyze_legal_video(file: UploadFile = File(...), background_tasks: Ba
         with open(transcript_path, "w") as f:
             f.write(text)
         summary_text = text[:4096] if len(text) > 4096 else text
-        summary = summarizer(summary_text, max_length=200, min_length=50, do_sample=False)[0]['summary_text'] if len(text) > 100 else "Transcript too short for meaningful summarization."
         entities = extract_named_entities(text)
         risk_analysis = analyze_risk_enhanced(text)
         clauses = analyze_contract_clauses(text)
@@ -451,7 +451,10 @@ async def analyze_legal_audio(file: UploadFile = File(...), background_tasks: Ba
         with open(transcript_path, "w") as f:
             f.write(text)
         summary_text = text[:4096] if len(text) > 4096 else text
-        summary = summarizer(summary_text, max_length=200, min_length=50, do_sample=False)[0]['summary_text'] if len(text) > 100 else "Transcript too short for meaningful summarization."
         entities = extract_named_entities(text)
         risk_analysis = analyze_risk_enhanced(text)
         clauses = analyze_contract_clauses(text)

 os.makedirs("static", exist_ok=True)
 os.makedirs("temp", exist_ok=True)
+# Set device to GPU if available
 device = "cuda" if torch.cuda.is_available() else "cpu"
 # Initialize FastAPI
 document_storage = {}
 chat_history = []
 def store_document_context(task_id, text):
     """Record ``text`` in ``document_storage`` under ``task_id``.

     Any value already stored for the same ``task_id`` is overwritten.

     Returns:
         Always ``True``.
     """
     document_storage.update({task_id: text})
     return True
 def load_document_context(task_id):
     """Look up the value stored in ``document_storage`` for ``task_id``.

     Returns:
         The stored value, or an empty string when ``task_id`` has no entry.
     """
     if task_id in document_storage:
         return document_storage[task_id]
     return ""
 def compute_md5(content: bytes) -> str:
     """Return the MD5 digest of ``content`` as a hexadecimal string."""
     hasher = hashlib.md5()
     hasher.update(content)
     return hasher.hexdigest()
         spacy.cli.download("en_core_web_sm")
         nlp = spacy.load("en_core_web_sm")
     print("✅ Loading NLP models...")
+    # Use T5-base for summarization; run on GPU (device=0) when CUDA is available, else on CPU (device=-1)
     summarizer = pipeline(
         "summarization",
+        model="t5-base",
+        tokenizer="t5-base",
         device=0 if torch.cuda.is_available() else -1
     )
+    # Keep the summarizer in full precision (no FP16 conversion) to reduce the risk of CUDA errors
     embedding_model = SentenceTransformer("all-mpnet-base-v2", device=device)
     ner_model = pipeline("ner", model="dslim/bert-base-NER", device=0 if torch.cuda.is_available() else -1)
     speech_to_text = pipeline("automatic-speech-recognition", model="openai/whisper-medium", chunk_length_s=30,
         if not text:
             return {"status": "error", "message": "No valid text found in the document."}
         summary_text = text[:4096] if len(text) > 4096 else text
+        summary_result = summarizer(summary_text, max_length=200, min_length=50, do_sample=False)
+        summary = summary_result[0].get("summary_text", "")
+        if not summary:
+            summary = "Summary not generated. Please check the input text."
         entities = extract_named_entities(text)
         risk_analysis = analyze_risk_enhanced(text)
         clauses = analyze_contract_clauses(text)
         with open(transcript_path, "w") as f:
             f.write(text)
         summary_text = text[:4096] if len(text) > 4096 else text
+        summary_result = summarizer(summary_text, max_length=200, min_length=50, do_sample=False)
+        summary = summary_result[0].get("summary_text", "")
+        if not summary:
+            summary = "Summary not generated. Please check the input transcript."
         entities = extract_named_entities(text)
         risk_analysis = analyze_risk_enhanced(text)
         clauses = analyze_contract_clauses(text)
         with open(transcript_path, "w") as f:
             f.write(text)
         summary_text = text[:4096] if len(text) > 4096 else text
+        summary_result = summarizer(summary_text, max_length=200, min_length=50, do_sample=False)
+        summary = summary_result[0].get("summary_text", "")
+        if not summary:
+            summary = "Summary not generated. Please check the input transcript."
         entities = extract_named_entities(text)
         risk_analysis = analyze_risk_enhanced(text)
         clauses = analyze_contract_clauses(text)