Spaces:

Krish-05
/

text-extraction-api

Sleeping

text-extraction-api / analyzers /summarizer.py

krishnachoudhary-hclguvi

Sync GitHub commit b749f19 updates

a2aa7c3 unverified about 2 months ago

6.08 kB

	"""
	Extractive text summarization using sumy library.
	Uses LexRank algorithm by default for graph-based sentence ranking.
	"""
	from sumy.parsers.plaintext import PlaintextParser
	from sumy.nlp.tokenizers import Tokenizer
	from sumy.summarizers.lex_rank import LexRankSummarizer
	from sumy.summarizers.lsa import LsaSummarizer
	from sumy.summarizers.luhn import LuhnSummarizer
	from sumy.nlp.stemmers import Stemmer
	from sumy.utils import get_stop_words
	from models.schemas import SummaryResult
	from config import SUMMARY_SENTENCE_COUNT, SUMMARY_ALGORITHM
	import config
	import time

	# Gemini support is optional: record whether the SDK could be imported so
	# that summarize_text() can check GEMINI_AVAILABLE before attempting the
	# Gemini path instead of failing at import time.
	try:
	import google.generativeai as genai
	GEMINI_AVAILABLE = True
	except ImportError:
	GEMINI_AVAILABLE = False

	# Language name passed to sumy's Tokenizer, Stemmer and get_stop_words.
	LANGUAGE = "english"


	def summarize_with_gemini(text: str) -> "SummaryResult | None":
	    """Generate an abstractive summary and key takeaways using Gemini.

	    The model is asked to answer in JSON with a ``summary`` string and a
	    ``key_points`` array; both are validated before a result is built.

	    Args:
	        text: Raw document text to summarize.

	    Returns:
	        A ``SummaryResult`` whose ``algorithm`` is "Gemini AI (Abstractive)",
	        or ``None`` when Gemini cannot be used (SDK not importable, not
	        configured), ``text`` is blank, the response carries no usable
	        summary, or the request/parsing fails. ``sentence_count`` holds the
	        number of key points because an abstractive summary has no
	        extracted-sentence count.
	    """
	    import json

	    # Nothing to summarize: skip the API round trip entirely.
	    if not text or not text.strip():
	        return None
	    # Without the SDK the name `genai` is undefined; bail out explicitly
	    # instead of relying on the broad handler below to catch a NameError.
	    if not GEMINI_AVAILABLE or not config.is_gemini_available():
	        return None

	    prompt = (
	        "You are an expert document analyst. Read the following text and create a highly synthesized, unique abstractive summary.\n"
	        "CRITICAL INSTRUCTIONS:\n"
	        "1. Do NOT just copy/paste or extract sentences verbatim from the text. Synthesize the meaning into your own words.\n"
	        "2. Provide a unique, high-level overview of the entire document's core message or purpose.\n"
	        "3. Structure the summary with thematic topics (e.g., Key Themes, Major Findings, Core Assertions, or document-specific domains like Experience for resumes).\n"
	        "4. For each topic, provide concise insights, not just a list of extracted facts.\n"
	        "5. Synthesize 3 to 7 truly unique, critical 'key points' that represent the ultimate takeaways of the document for the key_points array.\n"
	        "Respond strictly in JSON format:\n"
	        '{"summary": "Topic 1\\n- Insightful summary point 1...\\n\\nTopic 2\\n- Insightful summary point 2...", "key_points": ["CORE TAKEAWAY: synthesized point", ...]}'
	    )

	    # Best-effort boundary: any SDK, network, JSON or schema error must not
	    # break the caller, which falls back to extractive summarization.
	    try:
	        genai.configure(api_key=config.GEMINI_API_KEY)
	        model = genai.GenerativeModel(config.GEMINI_MODEL_NAME)
	        response = model.generate_content(
	            f"{prompt}\n\nText: {text}",
	            generation_config={"response_mime_type": "application/json"},
	        )

	        data = json.loads(response.text)
	        if not isinstance(data, dict):
	            raise ValueError("Gemini response is not a JSON object")

	        summary = data.get("summary", "")
	        if not isinstance(summary, str) or not summary.strip():
	            return None

	        # The model may return null, a bare string, or mixed items; keep
	        # only non-empty strings so len() really counts key points.
	        raw_points = data.get("key_points", [])
	        if isinstance(raw_points, str):
	            raw_points = [raw_points]
	        elif not isinstance(raw_points, list):
	            raw_points = []
	        key_points = [
	            point.strip()
	            for point in raw_points
	            if isinstance(point, str) and point.strip()
	        ]

	        compression_ratio = len(summary) / len(text) if len(text) > 0 else 1.0

	        return SummaryResult(
	            summary=summary,
	            key_points=key_points,
	            original_length=len(text),
	            summary_length=len(summary),
	            compression_ratio=round(compression_ratio, 4),
	            sentence_count=len(key_points),  # Using key_points count as surrogate
	            algorithm="Gemini AI (Abstractive)",
	        )
	    except Exception as e:
	        print(f"Gemini summarization failed: {e}")

	    return None


	def _get_summarizer(algorithm: str):
	    """Build a sumy summarizer for ``algorithm`` with stop words configured.

	    "lsa" and "luhn" select their respective summarizers; every other value
	    selects LexRank.
	    """
	    # Name -> class lookup; anything not listed uses the LexRank default.
	    known_summarizers = {
	        "lsa": LsaSummarizer,
	        "luhn": LuhnSummarizer,
	    }
	    summarizer_cls = known_summarizers.get(algorithm, LexRankSummarizer)

	    instance = summarizer_cls(Stemmer(LANGUAGE))
	    instance.stop_words = get_stop_words(LANGUAGE)
	    return instance


	def summarize_text(text: str, sentence_count: int = None, algorithm: str = None) -> SummaryResult:
	    """Generate an abstractive (Gemini) or extractive (sumy) summary of ``text``.

	    Gemini is tried first when it is available; otherwise, or when it yields
	    nothing, the text is summarized extractively.

	    Args:
	        text: Document text to summarize.
	        sentence_count: Number of sentences wanted in an extractive summary.
	            Defaults to ``SUMMARY_SENTENCE_COUNT``; values below 1 are
	            treated as 1.
	        algorithm: Extractive algorithm name handed to ``_get_summarizer``.
	            Defaults to ``SUMMARY_ALGORITHM``.

	    Returns:
	        A ``SummaryResult``. For extractive output ``sentence_count`` is the
	        number of sentences actually placed in the summary, and
	        ``algorithm`` gains a " (fallback)" suffix when the summarizer raised
	        and the leading sentences were used instead.
	    """
	    if sentence_count is None:
	        sentence_count = SUMMARY_SENTENCE_COUNT
	    if algorithm is None:
	        algorithm = SUMMARY_ALGORITHM
	    # A zero request would otherwise produce a summary consisting of a lone
	    # ".", and a negative one would slice from the wrong end; always ask for
	    # at least one sentence.
	    sentence_count = max(1, sentence_count)

	    # 0. Try Gemini (Superior abstractive quality)
	    if GEMINI_AVAILABLE and config.is_gemini_available():
	        gemini_result = summarize_with_gemini(text)
	        if gemini_result:
	            return gemini_result

	    # Rough sentence split on "." — only used to detect short inputs and to
	    # build the leading-sentences fallback.
	    sentences_in_text = [s.strip() for s in text.replace("\n", " ").split(".") if s.strip()]
	    if len(sentences_in_text) <= sentence_count:
	        # Text is already short enough: return it with whitespace collapsed.
	        clean_text = " ".join(text.split())
	        return SummaryResult(
	            summary=clean_text,
	            original_length=len(text),
	            summary_length=len(clean_text),
	            compression_ratio=1.0,
	            sentence_count=len(sentences_in_text),
	            algorithm=algorithm,
	        )

	    used_algorithm = algorithm
	    summary = ""
	    picked_count = 0
	    try:
	        parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
	        summarizer = _get_summarizer(algorithm)

	        # Materialize so the sentences can be both joined and counted.
	        summary_sentences = list(summarizer(parser.document, sentence_count))
	        summary = " ".join(str(sentence) for sentence in summary_sentences)
	        picked_count = len(summary_sentences)
	    except Exception as e:
	        # Best-effort: report the failure (as the Gemini path does) and fall
	        # through to the leading-sentences fallback below.
	        print(f"Extractive summarization failed: {e}")
	        used_algorithm = f"{algorithm} (fallback)"
	        summary = ""

	    if not summary.strip():
	        # Fallback: return first N sentences
	        leading = sentences_in_text[:sentence_count]
	        summary = ". ".join(leading) + "."
	        picked_count = len(leading)

	    compression_ratio = len(summary) / len(text) if len(text) > 0 else 1.0

	    return SummaryResult(
	        summary=summary,
	        original_length=len(text),
	        summary_length=len(summary),
	        compression_ratio=round(compression_ratio, 4),
	        sentence_count=picked_count,
	        algorithm=used_algorithm,
	    )