"""Legal document summarization.

Provides an extractive summarizer tuned for legal text (keyword, position,
length, citation scoring) with an optional LLM-based enhancement via Groq.
"""

import re

import nltk
import torch
from groq import Groq
from nltk.tokenize import sent_tokenize
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

# NOTE(review): torch / transformers imports appear unused in this portion of
# the file; kept in case other parts of the module (not visible here) rely on
# them. Confirm before removing.

# Download required NLTK tokenizer data. Best-effort: the data may already be
# cached locally, and a download failure should not prevent module import.
try:
    nltk.download('punkt', quiet=True)
    nltk.download('punkt_tab', quiet=True)
except Exception:
    # Was a bare `except:`; narrowed so SystemExit/KeyboardInterrupt propagate.
    pass


def summarize_legal_document(text, max_sentences=5, groq_api_key=None, model_path=None):
    """Summarize legal document text.

    Always produces an extractive summary; if ``groq_api_key`` is supplied,
    attempts to replace it with an LLM-generated summary, silently falling
    back to the extractive one on any Groq failure.

    Args:
        text: Input text to summarize.
        max_sentences: Desired number of sentences in the summary
            (clamped to the range [3, 20]).
        groq_api_key: Optional Groq API key for enhanced summarization.
        model_path: Optional custom model path (currently unused; accepted
            for interface compatibility).

    Returns:
        Dictionary with the summary and metadata. On failure, contains
        ``"error"`` and ``"success": False``.
    """
    if not text or not text.strip():
        return {"error": "Empty text provided", "success": False}

    # Clamp the requested summary size to a sane range.
    max_sentences = max(3, min(max_sentences, 20))

    result = {
        "original_length": len(text),
        "word_count": len(text.split()),
        "sentence_count": len(sent_tokenize(text)),
        "success": False,
    }

    try:
        # Always generate an extractive summary as the baseline result.
        extractive_summary = _extractive_summarize(text, max_sentences)
        result["summary"] = extractive_summary

        # Optionally upgrade to an LLM summary; any failure here is a
        # deliberate best-effort fallback to the extractive summary.
        if groq_api_key:
            try:
                groq_summary = _groq_summarize(text, max_sentences, groq_api_key)
                if groq_summary:
                    result["summary"] = groq_summary
            except Exception:
                pass

        # Final metrics are computed from whichever summary won.
        final_summary = result.get("summary", "")
        result["summary_length"] = len(final_summary)
        result["compression_ratio"] = (
            result["summary_length"] / result["original_length"]
            if result["original_length"] > 0
            else 0
        )
        result["success"] = True
    except Exception as e:
        result["error"] = str(e)
        result["success"] = False

    return result


def _extractive_summarize(text, max_sentences):
    """Extract key sentences based on legal-document heuristics.

    Scores each sentence by legal-keyword hits, document position, length,
    presence of years/amounts, and citation-like patterns, then returns the
    top ``max_sentences`` sentences in their original order.
    """
    sentences = sent_tokenize(text)
    # Nothing to trim — return the input unchanged.
    if len(sentences) <= max_sentences:
        return text

    legal_keywords = [
        'court', 'judge', 'plaintiff', 'defendant', 'appellant', 'respondent',
        'held', 'ruled', 'decided', 'judgment', 'order', 'section', 'article',
        'provision', 'law', 'legal', 'case', 'appeal', 'petition', 'writ',
        'contract', 'agreement', 'liability', 'damages', 'evidence', 'witness',
        'statute', 'regulation', 'finding', 'conclusion', 'reasoning',
    ]

    sentence_scores = []
    for i, sentence in enumerate(sentences):
        if not sentence.strip():
            continue
        score = 0
        sentence_lower = sentence.lower()

        # Keyword scoring: +1 per legal keyword present (substring match).
        for keyword in legal_keywords:
            if keyword in sentence_lower:
                score += 1

        # Position scoring: opening and closing sentences matter most.
        if i == 0:
            score += 3
        elif i == len(sentences) - 1:
            score += 2
        elif i < len(sentences) * 0.2:
            score += 1

        # Length scoring: favor medium-length sentences.
        word_count = len(sentence.split())
        if 15 <= word_count <= 40:
            score += 2
        elif 10 <= word_count <= 50:
            score += 1

        # Years, percentages, and dollar amounts signal concrete facts.
        if re.search(r'\b\d{4}\b|\b\d+\s*(percent|%|\$)', sentence):
            score += 1

        # Legal citations (e.g. "123 F. 456" or "v. Smith") are high value.
        if re.search(r'\d+\s+[A-Z][a-z]+\.?\s+\d+|\bv\.\s+[A-Z]', sentence):
            score += 2

        sentence_scores.append((score, i, sentence))

    # Take the highest-scoring sentences, then restore document order.
    sentence_scores.sort(reverse=True, key=lambda x: x[0])
    selected_sentences = sentence_scores[:max_sentences]
    selected_sentences.sort(key=lambda x: x[1])

    return ' '.join([sent[2] for sent in selected_sentences])


def _groq_summarize(text, max_sentences, api_key):
    """Enhanced summarization using the Groq LLM.

    Returns the generated summary string, or ``None`` on any failure or if
    the model produced an implausibly short response. Exceptions are never
    propagated — the caller treats ``None`` as "fall back to extractive".
    """
    try:
        client = Groq(api_key=api_key)

        # Truncate overly long input to stay within prompt limits.
        if len(text) > 6000:
            text = text[:6000] + "\n[...text truncated...]"

        system_prompt = """You are an expert legal document summarizer. Create concise, accurate summaries that capture the most important information.

Guidelines:
1. Focus on key legal facts, holdings, and conclusions
2. Preserve important legal terminology and concepts
3. Maintain logical flow of legal reasoning
4. Include relevant case citations, statutes, or regulations
5. Be precise and avoid unnecessary elaboration"""

        user_prompt = f"""Please summarize the following legal document in approximately {max_sentences} sentences:

{text}

Provide a clear, concise summary:"""

        response = client.chat.completions.create(
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            model="llama-3.1-8b-instant",
            temperature=0.2,
            max_tokens=800,
            top_p=0.9,
        )

        summary = response.choices[0].message.content.strip()
        # Guard against empty or degenerate model output.
        if summary and len(summary) > 20:
            return summary
    except Exception:
        # Best-effort: any API/network/parse error falls back to extractive.
        pass

    return None


def _chunk_text(text, max_words):
    """Split text into word-bounded chunks of at most ``max_words`` words.

    NOTE(review): not referenced by any function visible in this file;
    presumably used elsewhere or retained for future chunked processing.
    """
    words = text.split()
    chunks = []
    for i in range(0, len(words), max_words):
        chunk_words = words[i:i + max_words]
        if chunk_words:
            chunks.append(' '.join(chunk_words))
    return chunks