Spaces:
Sleeping
Sleeping
File size: 6,077 Bytes
52a0fe9 a2aa7c3 52a0fe9 a2aa7c3 52a0fe9 a2aa7c3 52a0fe9 a2aa7c3 52a0fe9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 | """
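Text summarization: abstractive via Gemini when it is available, otherwise
extractive via the sumy library.
The extractive path supports LSA and Luhn and falls back to the graph-based
LexRank algorithm for any other algorithm name.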
"""
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
from models.schemas import SummaryResult
from config import SUMMARY_SENTENCE_COUNT, SUMMARY_ALGORITHM
import config
import time
# The Gemini SDK is treated as an optional dependency: record whether the
# import succeeded so other code can check the flag before using ``genai``.
try:
    import google.generativeai as genai
    GEMINI_AVAILABLE = True
except ImportError:
    GEMINI_AVAILABLE = False

# Language name handed to sumy's tokenizer, stemmer and stop-word lookup.
LANGUAGE = "english"
def summarize_with_gemini(text: str) -> "SummaryResult | None":
    """Generate an abstractive summary and key highlights using Gemini AI.

    Args:
        text: The document text to summarize.

    Returns:
        A ``SummaryResult`` built from the model's JSON reply, or ``None``
        when Gemini is not usable, the request or JSON parsing fails, or the
        reply contains no usable summary. ``None`` signals the caller to fall
        back to another summarization method.
    """
    import json

    # The SDK import is optional; when it failed ``genai`` is undefined, so
    # stop here instead of relying on a NameError being swallowed below.
    if not GEMINI_AVAILABLE or not config.is_gemini_available():
        return None

    try:
        genai.configure(api_key=config.GEMINI_API_KEY)
        model = genai.GenerativeModel(config.GEMINI_MODEL_NAME)
        prompt = (
            "You are an expert document analyst. Read the following text and create a highly synthesized, unique abstractive summary.\n"
            "CRITICAL INSTRUCTIONS:\n"
            "1. Do NOT just copy/paste or extract sentences verbatim from the text. Synthesize the meaning into your own words.\n"
            "2. Provide a unique, high-level overview of the entire document's core message or purpose.\n"
            "3. Structure the summary with thematic topics (e.g., **Key Themes**, **Major Findings**, **Core Assertions**, or document-specific domains like **Experience** for resumes).\n"
            "4. For each topic, provide concise insights, not just a list of extracted facts.\n"
            "5. Synthesize 3 to 7 truly unique, critical 'key points' that represent the ultimate takeaways of the document for the key_points array.\n"
            "Respond strictly in JSON format:\n"
            '{"summary": "**Topic 1**\\n- Insightful summary point 1...\\n\\n**Topic 2**\\n- Insightful summary point 2...", "key_points": ["**CORE TAKEAWAY**: synthesized point", ...]}'
        )
        response = model.generate_content(
            f"{prompt}\n\nText: {text}",
            generation_config={"response_mime_type": "application/json"},
        )
        data = json.loads(response.text)

        # The model is asked for a JSON object, but nothing enforces it;
        # anything else cannot be turned into a result.
        if not isinstance(data, dict):
            return None

        summary = data.get("summary")
        if not isinstance(summary, str) or not summary:
            return None

        # A missing, null or malformed key_points field should not discard
        # an otherwise valid summary.
        key_points = data.get("key_points")
        if not isinstance(key_points, list):
            key_points = []

        compression_ratio = len(summary) / len(text) if text else 1.0
        return SummaryResult(
            summary=summary,
            key_points=key_points,
            original_length=len(text),
            summary_length=len(summary),
            compression_ratio=round(compression_ratio, 4),
            sentence_count=len(key_points),  # Using key_points count as surrogate
            algorithm="Gemini AI (Abstractive)",
        )
    except Exception as e:
        # Best-effort: any SDK, network or parsing failure means "no Gemini
        # result" rather than an error for the caller.
        print(f"Gemini summarization failed: {e}")
    return None
def _get_summarizer(algorithm: str):
    """Build a sumy summarizer for *algorithm*, with stemmer and stop words set.

    ``"lsa"`` and ``"luhn"`` select their respective summarizers; every other
    value selects LexRank.
    """
    if algorithm == "lsa":
        summarizer_cls = LsaSummarizer
    elif algorithm == "luhn":
        summarizer_cls = LuhnSummarizer
    else:
        # Anything unrecognised uses lex-rank.
        summarizer_cls = LexRankSummarizer

    ranker = summarizer_cls(Stemmer(LANGUAGE))
    ranker.stop_words = get_stop_words(LANGUAGE)
    return ranker
def summarize_text(text: str, sentence_count: "int | None" = None, algorithm: "str | None" = None) -> SummaryResult:
    """Generate an abstractive or extractive summary of the given text.

    Gemini is tried first when it is available; otherwise (or when it yields
    nothing) a sumy extractive summarizer is used, with a "first N sentences"
    fallback if that produces no output or raises.

    Args:
        text: The text to summarize.
        sentence_count: Number of sentences requested from the extractive
            summarizer. Defaults to ``SUMMARY_SENTENCE_COUNT``.
        algorithm: Extractive algorithm name passed to ``_get_summarizer``.
            Defaults to ``SUMMARY_ALGORITHM``.

    Returns:
        A ``SummaryResult``. Its ``algorithm`` field is suffixed with
        ``" (fallback)"`` when the extractive summarizer raised.
    """
    if sentence_count is None:
        sentence_count = SUMMARY_SENTENCE_COUNT
    if algorithm is None:
        algorithm = SUMMARY_ALGORITHM

    # 0. Try Gemini (superior abstractive quality). Blank input is skipped:
    # there is nothing to summarize, so a remote call would be wasted.
    if text.strip() and GEMINI_AVAILABLE and config.is_gemini_available():
        gemini_result = summarize_with_gemini(text)
        if gemini_result:
            return gemini_result

    # Rough sentence split on "." used for the short-text check and fallback.
    sentences_in_text = [s.strip() for s in text.replace("\n", " ").split(".") if s.strip()]

    if len(sentences_in_text) <= sentence_count:
        # Text is already short enough: return it with whitespace normalised.
        clean_text = " ".join(text.split())
        return SummaryResult(
            summary=clean_text,
            original_length=len(text),
            summary_length=len(clean_text),
            compression_ratio=1.0,
            sentence_count=len(sentences_in_text),
            algorithm=algorithm,
        )

    used_algorithm = algorithm
    summary = ""
    actual_count = 0
    try:
        parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
        summarizer = _get_summarizer(algorithm)
        summary_sentences = list(summarizer(parser.document, sentence_count))
        summary = " ".join(str(sentence) for sentence in summary_sentences)
        actual_count = len(summary_sentences)
    except Exception as e:
        # Best-effort: report the failure and fall through to the simple
        # first-N-sentences fallback instead of raising to the caller.
        print(f"Extractive summarization failed: {e}")
        summary = ""
        used_algorithm = f"{algorithm} (fallback)"

    if not summary.strip():
        # Fallback: return the first N sentences of the rough split.
        leading = sentences_in_text[:sentence_count]
        summary = ". ".join(leading) + "."
        actual_count = len(leading)

    compression_ratio = len(summary) / len(text) if text else 1.0
    return SummaryResult(
        summary=summary,
        original_length=len(text),
        summary_length=len(summary),
        compression_ratio=round(compression_ratio, 4),
        # Report how many sentences the summary actually contains, matching
        # the short-text path, rather than the number requested.
        sentence_count=actual_count,
        algorithm=used_algorithm,
    )
|