Spaces:
Sleeping
Sleeping
| """ | |
| Extractive text summarization using sumy library. | |
| Uses LexRank algorithm by default for graph-based sentence ranking. | |
| """ | |
| from sumy.parsers.plaintext import PlaintextParser | |
| from sumy.nlp.tokenizers import Tokenizer | |
| from sumy.summarizers.lex_rank import LexRankSummarizer | |
| from sumy.summarizers.lsa import LsaSummarizer | |
| from sumy.summarizers.luhn import LuhnSummarizer | |
| from sumy.nlp.stemmers import Stemmer | |
| from sumy.utils import get_stop_words | |
| from models.schemas import SummaryResult | |
| from config import SUMMARY_SENTENCE_COUNT, SUMMARY_ALGORITHM | |
| import config | |
| import time | |
| try: | |
| import google.generativeai as genai | |
| GEMINI_AVAILABLE = True | |
| except ImportError: | |
| GEMINI_AVAILABLE = False | |
| LANGUAGE = "english" | |
def summarize_with_gemini(text: str) -> "SummaryResult | None":
    """Generate an abstractive summary and key highlights using Gemini.

    Args:
        text: The document text to summarize.

    Returns:
        A ``SummaryResult`` built from the model's JSON reply, or ``None``
        when the text is empty, Gemini is not configured, the request fails,
        or the reply does not contain a usable ``summary`` string. Callers are
        expected to treat ``None`` as "fall back to another summarizer".
    """
    import json  # local import: only needed on the Gemini path

    # Nothing to summarize: skip the API round-trip entirely.
    if not text or not text.strip():
        return None
    if not config.is_gemini_available():
        return None

    prompt = (
        "You are an expert document analyst. Read the following text and create a highly synthesized, unique abstractive summary.\n"
        "CRITICAL INSTRUCTIONS:\n"
        "1. Do NOT just copy/paste or extract sentences verbatim from the text. Synthesize the meaning into your own words.\n"
        "2. Provide a unique, high-level overview of the entire document's core message or purpose.\n"
        "3. Structure the summary with thematic topics (e.g., **Key Themes**, **Major Findings**, **Core Assertions**, or document-specific domains like **Experience** for resumes).\n"
        "4. For each topic, provide concise insights, not just a list of extracted facts.\n"
        "5. Synthesize 3 to 7 truly unique, critical 'key points' that represent the ultimate takeaways of the document for the key_points array.\n"
        "Respond strictly in JSON format:\n"
        '{"summary": "**Topic 1**\\n- Insightful summary point 1...\\n\\n**Topic 2**\\n- Insightful summary point 2...", "key_points": ["**CORE TAKEAWAY**: synthesized point", ...]}'
    )

    # This is a best-effort boundary: any failure (missing SDK, network error,
    # malformed JSON, result validation) must yield None so the caller can
    # fall back, hence the deliberately broad except below.
    try:
        genai.configure(api_key=config.GEMINI_API_KEY)
        model = genai.GenerativeModel(config.GEMINI_MODEL_NAME)
        response = model.generate_content(
            f"{prompt}\n\nText: {text}",
            generation_config={"response_mime_type": "application/json"},
        )
        data = json.loads(response.text)

        # The model is asked for a JSON object, but nothing guarantees it.
        if not isinstance(data, dict):
            print("Gemini summarization failed: response was not a JSON object")
            return None

        summary = data.get("summary", "")
        if not isinstance(summary, str) or not summary:
            return None

        raw_points = data.get("key_points", [])
        if not isinstance(raw_points, list):
            raw_points = []
        # Keep only non-blank string entries.
        key_points = [
            point for point in raw_points
            if isinstance(point, str) and point.strip()
        ]

        return SummaryResult(
            summary=summary,
            key_points=key_points,
            original_length=len(text),
            summary_length=len(summary),
            compression_ratio=round(len(summary) / len(text), 4),
            # Abstractive output has no sentence list; the number of key
            # points is used as a surrogate.
            sentence_count=len(key_points),
            algorithm="Gemini AI (Abstractive)",
        )
    except Exception as e:
        print(f"Gemini summarization failed: {e}")
        return None
def _get_summarizer(algorithm: str):
    """Build a configured sumy summarizer for the requested algorithm.

    ``"lsa"`` and ``"luhn"`` select their respective summarizers; any other
    value selects LexRank. The returned summarizer uses a stemmer and the
    stop-word list for ``LANGUAGE``.
    """
    stemmer = Stemmer(LANGUAGE)

    # Name -> summarizer class; anything not listed falls back to LexRank.
    summarizer_classes = {
        "lsa": LsaSummarizer,
        "luhn": LuhnSummarizer,
    }
    summarizer_cls = summarizer_classes.get(algorithm, LexRankSummarizer)

    instance = summarizer_cls(stemmer)
    instance.stop_words = get_stop_words(LANGUAGE)
    return instance
def summarize_text(text: str, sentence_count: int = None, algorithm: str = None) -> SummaryResult:
    """Generate an abstractive or extractive summary of the given text.

    Gemini is tried first when it is available. Otherwise (or when it yields
    nothing) the text is summarized extractively with sumy.

    Args:
        text: The document text to summarize.
        sentence_count: Number of sentences wanted in an extractive summary.
            Defaults to ``SUMMARY_SENTENCE_COUNT``; values below 1 are
            treated as 1.
        algorithm: Extractive algorithm name passed to ``_get_summarizer``.
            Defaults to ``SUMMARY_ALGORITHM``.

    Returns:
        A ``SummaryResult``. For extractive results ``sentence_count`` is the
        number of sentences actually present in the summary, and ``algorithm``
        is suffixed with ``" (fallback)"`` when the summarizer raised and the
        leading sentences of the text were returned instead.
    """
    if sentence_count is None:
        sentence_count = SUMMARY_SENTENCE_COUNT
    if algorithm is None:
        algorithm = SUMMARY_ALGORITHM
    # A zero or negative request would slip past the short-text check below
    # and end up producing a summary consisting of a lone ".".
    sentence_count = max(1, sentence_count)

    # 0. Try Gemini (abstractive); None means "not available / failed".
    if GEMINI_AVAILABLE and config.is_gemini_available():
        gemini_result = summarize_with_gemini(text)
        if gemini_result is not None:
            return gemini_result

    # Rough sentence split, used for the short-text check and the fallback.
    sentences_in_text = [s.strip() for s in text.replace("\n", " ").split(".") if s.strip()]

    if len(sentences_in_text) <= sentence_count:
        # Text is already short enough: return it with whitespace normalized.
        clean_text = " ".join(text.split())
        return SummaryResult(
            summary=clean_text,
            original_length=len(text),
            summary_length=len(clean_text),
            compression_ratio=1.0,
            sentence_count=len(sentences_in_text),
            algorithm=algorithm,
        )

    # Leading sentences, used whenever the summarizer fails or returns nothing.
    # Non-empty here because len(sentences_in_text) > sentence_count >= 1.
    fallback_sentences = sentences_in_text[:sentence_count]
    fallback_summary = ". ".join(fallback_sentences) + "."

    used_algorithm = algorithm
    try:
        parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
        summarizer = _get_summarizer(algorithm)
        summary_sentences = list(summarizer(parser.document, sentence_count))
        summary = " ".join(str(sentence) for sentence in summary_sentences)
        actual_count = len(summary_sentences)
    except Exception as e:
        # Best-effort: report the failure and degrade to the leading sentences.
        print(f"Extractive summarization failed: {e}")
        summary = fallback_summary
        actual_count = len(fallback_sentences)
        used_algorithm = f"{algorithm} (fallback)"
    else:
        if not summary.strip():
            # The summarizer ran but selected nothing usable.
            summary = fallback_summary
            actual_count = len(fallback_sentences)

    compression_ratio = len(summary) / len(text) if len(text) > 0 else 1.0
    return SummaryResult(
        summary=summary,
        original_length=len(text),
        summary_length=len(summary),
        compression_ratio=round(compression_ratio, 4),
        sentence_count=actual_count,
        algorithm=used_algorithm,
    )