# doc-processor / simple / summarizer.py
# Author: Kartik Narang — first clean commit (3cfeab7)
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from groq import Groq
import re
from nltk.tokenize import sent_tokenize
import nltk
# Download required NLTK tokenizer data. Best-effort: if the download fails
# (e.g. offline environment), we proceed and let sent_tokenize surface the
# error later. `except Exception` (not bare `except:`) so Ctrl-C and
# SystemExit still propagate.
try:
    nltk.download('punkt', quiet=True)
    nltk.download('punkt_tab', quiet=True)
except Exception:
    pass
def summarize_legal_document(text, max_sentences=5, groq_api_key=None, model_path=None):
    """Summarize legal document text.

    Args:
        text: Input text to summarize.
        max_sentences: Target number of summary sentences (clamped to 3-20).
        groq_api_key: Optional Groq API key for LLM-enhanced summarization.
        model_path: Optional custom model path (currently unused here).

    Returns:
        Dictionary with the summary plus input/output metadata; on failure,
        a dictionary with "error" set and "success" False.
    """
    # Guard clause: nothing to summarize.
    if not text or not text.strip():
        return {"error": "Empty text provided", "success": False}

    # Clamp the requested summary size to a sane range.
    max_sentences = max(3, min(max_sentences, 20))

    report = {
        "original_length": len(text),
        "word_count": len(text.split()),
        "sentence_count": len(sent_tokenize(text)),
        "success": False,
    }

    try:
        # Always produce the extractive summary as a baseline.
        report["summary"] = _extractive_summarize(text, max_sentences)

        # Optionally upgrade to an LLM summary; any failure (or an empty
        # response) silently falls back to the extractive baseline.
        if groq_api_key:
            try:
                enhanced = _groq_summarize(text, max_sentences, groq_api_key)
            except Exception:
                enhanced = None
            if enhanced:
                report["summary"] = enhanced

        final_text = report.get("summary", "")
        report["summary_length"] = len(final_text)
        report["compression_ratio"] = (
            report["summary_length"] / report["original_length"]
            if report["original_length"] > 0
            else 0
        )
        report["success"] = True
    except Exception as exc:
        report["error"] = str(exc)
        report["success"] = False

    return report
def _extractive_summarize(text, max_sentences):
    """Pick the highest-scoring sentences using legal-domain heuristics."""
    sentences = sent_tokenize(text)

    # Documents already at or under the target length are returned untouched.
    if len(sentences) <= max_sentences:
        return text

    legal_keywords = [
        'court', 'judge', 'plaintiff', 'defendant', 'appellant', 'respondent',
        'held', 'ruled', 'decided', 'judgment', 'order', 'section', 'article',
        'provision', 'law', 'legal', 'case', 'appeal', 'petition', 'writ',
        'contract', 'agreement', 'liability', 'damages', 'evidence', 'witness',
        'statute', 'regulation', 'finding', 'conclusion', 'reasoning'
    ]

    total = len(sentences)
    scored = []
    for idx, sentence in enumerate(sentences):
        if not sentence.strip():
            continue
        lowered = sentence.lower()

        # +1 for each distinct legal keyword that appears.
        points = sum(1 for kw in legal_keywords if kw in lowered)

        # Positional weighting: openings and closings matter most.
        if idx == 0:
            points += 3
        elif idx == total - 1:
            points += 2
        elif idx < total * 0.2:
            points += 1

        # Prefer mid-length sentences.
        n_words = len(sentence.split())
        if 15 <= n_words <= 40:
            points += 2
        elif 10 <= n_words <= 50:
            points += 1

        # Years, percentages, and dollar figures.
        if re.search(r'\b\d{4}\b|\b\d+\s*(percent|%|\$)', sentence):
            points += 1

        # Citation-like patterns (reporter cites, "v. Name").
        if re.search(r'\d+\s+[A-Z][a-z]+\.?\s+\d+|\bv\.\s+[A-Z]', sentence):
            points += 2

        scored.append((points, idx, sentence))

    # Keep the top-scoring sentences, then restore original document order.
    # Both sorts are stable, matching the original tie-breaking behavior.
    top = sorted(scored, key=lambda item: item[0], reverse=True)[:max_sentences]
    top.sort(key=lambda item: item[1])
    return ' '.join(entry[2] for entry in top)
def _groq_summarize(text, max_sentences, api_key):
    """Generate an LLM summary via the Groq API; return None on any failure."""
    try:
        client = Groq(api_key=api_key)

        # Truncate long inputs to stay within the model's context budget.
        if len(text) > 6000:
            text = text[:6000] + "\n[...text truncated...]"

        system_prompt = """You are an expert legal document summarizer. Create concise, accurate summaries that capture the most important information.
Guidelines:
1. Focus on key legal facts, holdings, and conclusions
2. Preserve important legal terminology and concepts
3. Maintain logical flow of legal reasoning
4. Include relevant case citations, statutes, or regulations
5. Be precise and avoid unnecessary elaboration"""

        user_prompt = f"""Please summarize the following legal document in approximately {max_sentences} sentences:
{text}
Provide a clear, concise summary:"""

        response = client.chat.completions.create(
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            model="llama-3.1-8b-instant",
            temperature=0.2,
            max_tokens=800,
            top_p=0.9,
        )

        candidate = response.choices[0].message.content.strip()
        # Reject empty or suspiciously short completions so the caller
        # falls back to the extractive summary.
        if candidate and len(candidate) > 20:
            return candidate
    except Exception:
        pass
    return None
def _chunk_text(text, max_words):
"""Split text into chunks for processing"""
words = text.split()
chunks = []
for i in range(0, len(words), max_words):
chunk_words = words[i:i + max_words]
if chunk_words:
chunks.append(' '.join(chunk_words))
return chunks