text-extraction-api / analyzers /summarizer.py
krishnachoudhary-hclguvi
Sync GitHub commit b749f19 updates
a2aa7c3 unverified
"""
Extractive text summarization using sumy library.
Uses LexRank algorithm by default for graph-based sentence ranking.
"""
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
from models.schemas import SummaryResult
from config import SUMMARY_SENTENCE_COUNT, SUMMARY_ALGORITHM
import config
import time
try:
import google.generativeai as genai
GEMINI_AVAILABLE = True
except ImportError:
GEMINI_AVAILABLE = False
LANGUAGE = "english"
def summarize_with_gemini(text: str) -> "SummaryResult | None":
    """Generate an abstractive summary and key points using Gemini.

    Args:
        text: Document text to summarize.

    Returns:
        A SummaryResult labelled "Gemini AI (Abstractive)", or None when
        Gemini is not usable, the request or JSON decoding fails, or the
        response does not carry a non-blank string summary. None is the
        signal for callers to fall back to another summarizer; this
        function does not raise.
    """
    import json

    # The SDK import at module level is optional; without it `genai` is
    # undefined, so bail out before touching it.
    if not GEMINI_AVAILABLE or not config.is_gemini_available():
        return None

    prompt = (
        "You are an expert document analyst. Read the following text and create a highly synthesized, unique abstractive summary.\n"
        "CRITICAL INSTRUCTIONS:\n"
        "1. Do NOT just copy/paste or extract sentences verbatim from the text. Synthesize the meaning into your own words.\n"
        "2. Provide a unique, high-level overview of the entire document's core message or purpose.\n"
        "3. Structure the summary with thematic topics (e.g., **Key Themes**, **Major Findings**, **Core Assertions**, or document-specific domains like **Experience** for resumes).\n"
        "4. For each topic, provide concise insights, not just a list of extracted facts.\n"
        "5. Synthesize 3 to 7 truly unique, critical 'key points' that represent the ultimate takeaways of the document for the key_points array.\n"
        "Respond strictly in JSON format:\n"
        '{"summary": "**Topic 1**\\n- Insightful summary point 1...\\n\\n**Topic 2**\\n- Insightful summary point 2...", "key_points": ["**CORE TAKEAWAY**: synthesized point", ...]}'
    )

    # Deliberately broad: any SDK, network, decoding or model-validation
    # failure must degrade to "no Gemini result" rather than crash the caller.
    try:
        genai.configure(api_key=config.GEMINI_API_KEY)
        model = genai.GenerativeModel(config.GEMINI_MODEL_NAME)
        response = model.generate_content(
            f"{prompt}\n\nText: {text}",
            generation_config={"response_mime_type": "application/json"},
        )
        data = json.loads(response.text)
        if not isinstance(data, dict):
            raise ValueError("response is not a JSON object")

        summary = data.get("summary", "")
        if not isinstance(summary, str) or not summary.strip():
            # No usable summary: let the caller fall back.
            return None

        # The model is asked for a list of strings; drop anything else so
        # the key-point count below is meaningful.
        raw_points = data.get("key_points", [])
        if not isinstance(raw_points, list):
            raw_points = []
        key_points = [p for p in raw_points if isinstance(p, str) and p.strip()]

        compression_ratio = len(summary) / len(text) if text else 1.0
        return SummaryResult(
            summary=summary,
            key_points=key_points,
            original_length=len(text),
            summary_length=len(summary),
            compression_ratio=round(compression_ratio, 4),
            # An abstractive summary has no extracted sentences to count;
            # the number of key points is used as a surrogate.
            sentence_count=len(key_points),
            algorithm="Gemini AI (Abstractive)",
        )
    except Exception as e:
        print(f"Gemini summarization failed: {e}")
    return None
def _get_summarizer(algorithm: str):
    """Build a sumy summarizer for the requested algorithm.

    "lsa" and "luhn" select their namesake summarizers; every other value
    selects LexRank. The summarizer is created with a stemmer for the
    module-level LANGUAGE and is given that language's stop words.
    """
    summarizer_cls = (
        LsaSummarizer if algorithm == "lsa"
        else LuhnSummarizer if algorithm == "luhn"
        else LexRankSummarizer  # anything unrecognised uses lex-rank
    )
    instance = summarizer_cls(Stemmer(LANGUAGE))
    instance.stop_words = get_stop_words(LANGUAGE)
    return instance
def summarize_text(text: str, sentence_count: "int | None" = None, algorithm: "str | None" = None) -> SummaryResult:
    """Generate an abstractive or extractive summary of the given text.

    Gemini is tried first when it is available and the text is not blank.
    Otherwise the text is summarized extractively with sumy; if that fails
    or yields nothing, the first sentences of the text are returned.

    Args:
        text: Text to summarize.
        sentence_count: Maximum number of sentences for the extractive
            summary. Defaults to SUMMARY_SENTENCE_COUNT.
        algorithm: Extractive algorithm name passed to _get_summarizer.
            Defaults to SUMMARY_ALGORITHM. Not used when Gemini succeeds.

    Returns:
        A SummaryResult. Its ``sentence_count`` is the number of sentences
        actually present in the returned summary, and its ``algorithm`` is
        suffixed with " (fallback)" when the extractive summarizer raised.
    """
    if sentence_count is None:
        sentence_count = SUMMARY_SENTENCE_COUNT
    if algorithm is None:
        algorithm = SUMMARY_ALGORITHM

    # 0. Try Gemini (abstractive). Blank input is skipped: there is nothing
    # to summarize, so a remote call would be wasted and could only return
    # invented content.
    if text.strip() and GEMINI_AVAILABLE and config.is_gemini_available():
        gemini_result = summarize_with_gemini(text)
        if gemini_result is not None:
            return gemini_result

    # Naive period-based split, used only to detect short texts and to
    # build the first-N-sentences fallback.
    sentences_in_text = [s.strip() for s in text.replace("\n", " ").split(".") if s.strip()]
    if len(sentences_in_text) <= sentence_count:
        # Text is already short enough: return it with whitespace collapsed.
        clean_text = " ".join(text.split())
        return SummaryResult(
            summary=clean_text,
            original_length=len(text),
            summary_length=len(clean_text),
            compression_ratio=1.0,
            sentence_count=len(sentences_in_text),
            algorithm=algorithm,
        )

    label = algorithm
    summary = ""
    used_sentences = 0
    try:
        parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
        summarizer = _get_summarizer(algorithm)
        picked = list(summarizer(parser.document, sentence_count))
        summary = " ".join(str(sentence) for sentence in picked)
        used_sentences = len(picked)
    except Exception as e:
        # Best-effort: report the failure and fall back to leading sentences.
        print(f"Extractive summarization failed: {e}")
        summary = ""
        label = f"{algorithm} (fallback)"

    if not summary.strip():
        summary, used_sentences = _leading_sentences_summary(sentences_in_text, sentence_count)

    compression_ratio = round(len(summary) / len(text), 4) if text else 1.0
    return SummaryResult(
        summary=summary,
        original_length=len(text),
        summary_length=len(summary),
        compression_ratio=compression_ratio,
        sentence_count=used_sentences,
        algorithm=label,
    )


def _leading_sentences_summary(sentences, sentence_count):
    """Return (summary, count) built from the first *sentence_count* sentences."""
    head = sentences[:sentence_count]
    return ". ".join(head) + ".", len(head)