Spaces:
Running
Running
| import logging | |
| from typing import Optional, List, Dict, Any | |
| import torch | |
| from .preprocessing import TextPreprocessor, TechnicalDocumentParser | |
| from .models import SummarizationModelLoader, IntentClassifier, ContextPreserver | |
| from .utils import chunk_text | |
| from .evaluation import SummaryEvaluator | |
| from .keywords import KeywordExtractor | |
| from .exporters import SummaryExporter | |
| from .rag import RAGPipeline, ContextPreserver as RAGContextPreserver | |
| from .model_selector import ModelSelector | |
| from .intent_engine import ( | |
| get_intent_config, | |
| get_level_config, | |
| get_quality_config, | |
| extract_intent_relevant_text, | |
| build_t5_input, | |
| postprocess_summary, | |
| translate_summary, | |
| ) | |
| logger = logging.getLogger(__name__) | |
class TechnicalDocumentSummarizer:
    """Main summarization pipeline for technical documents with language support.

    The constructor wires together the preprocessing, model-loading, intent,
    RAG, evaluation, keyword and export helpers and loads the model/tokenizer
    pair once.  ``summarize`` is the core entry point; ``auto_summarize``,
    ``summarize_batch`` and ``summarize_with_sections`` are thin wrappers
    around it.
    """

    def __init__(self, model_name: str = 't5-small', device: Optional[str] = None,
                 language: Optional[str] = None):
        """
        Initialize the summarizer.

        Args:
            model_name: Name of the model to use (default: t5-small for speed)
            device: Device to run on ('cpu' or 'cuda'); when omitted, 'cuda' is
                chosen if ``torch.cuda.is_available()`` and 'cpu' otherwise
            language: Language for summarization. When ``None`` the user is
                prompted on an interactive terminal; in non-interactive mode
                it falls back to 'english'.
        """
        # Only prompt for language if running interactively and language not provided
        if language is None:
            import sys

            stdin = sys.stdin
            # sys.stdin can be None (e.g. GUI / detached processes); treat that
            # as non-interactive instead of raising AttributeError.
            if stdin is not None and stdin.isatty():
                print("\n=== LANGUAGE SELECTION ===")
                print("Supported languages: english, spanish, french, german, italian,")
                print("portuguese, chinese, japanese, korean, arabic, hindi, russian, turkish, vietnamese, thai")
                try:
                    language = input(
                        "\nEnter desired language for summarization (default: english): "
                    ).strip() or 'english'
                except EOFError:
                    # Input stream closed while prompting: use the default.
                    language = 'english'
            else:
                language = 'english'  # Default to English in non-interactive mode

        self.model_name = model_name
        self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
        self.language = language

        self.preprocessor = TextPreprocessor(remove_stopwords=False)
        self.parser = TechnicalDocumentParser()
        self.model_loader = SummarizationModelLoader(model_name, self.device, language)
        self.intent_classifier = IntentClassifier()
        self.context_preserver = ContextPreserver()
        self.model, self.tokenizer = self.model_loader.load_model()
        self.evaluator = SummaryEvaluator()
        self.keyword_extractor = KeywordExtractor()
        self.exporter = SummaryExporter()
        self.rag_pipeline = RAGPipeline()
        self.model_selector = ModelSelector()
        self.model_cache = {}  # not read anywhere in this class

        logger.info("Summarizer initialized with %s for %s", model_name, language)

    def auto_summarize(
        self,
        document: str,
        intent: str = 'technical_overview',
        quality_preference: str = 'balanced',
        summary_level: str = 'brief',
        language: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Auto-select settings and summarize with full intent / level / quality / language support.

        Args:
            document: Document text to summarize
            intent: Summarization intent
            quality_preference: Quality preset name passed to the model selector
                and to ``get_quality_config``
            summary_level: Level preset name passed to ``get_level_config``
            language: Optional per-call language override

        Returns:
            Dict with keys 'summary', 'model', 'complexity', 'use_rag',
            'estimated_time' and 'reason'.
        """
        recommendation = self.model_selector.recommend_settings(document, quality_preference)
        logger.info(
            "Model=%s | quality=%s | intent=%s | level=%s",
            recommendation['model'], quality_preference, intent, summary_level,
        )
        use_rag = recommendation.get('use_rag', False)

        # Merge level + quality + intent configs for generation params
        level_cfg = get_level_config(summary_level)
        quality_cfg = get_quality_config(quality_preference)
        max_length = level_cfg['max_length']
        min_length = level_cfg['min_length']
        # Use the larger of the selector's and the quality preset's beam count.
        num_beams = max(recommendation.get('num_beams', 2), quality_cfg['num_beams'])

        summary = self.summarize(
            document,
            intent=intent,
            summary_level=summary_level,
            quality_preference=quality_preference,
            max_length=max_length,
            min_length=min_length,
            num_beams=num_beams,
            language=language,
            use_rag=use_rag,
        )

        # NOTE(review): 'model' is the selector's recommendation; generation in
        # summarize() runs on the model loaded in __init__ - confirm whether
        # the recommended model is meant to be loaded here.
        return {
            'summary': summary,
            'model': recommendation['model'],
            'complexity': str(self.model_selector.current_complexity),
            'use_rag': use_rag,
            'estimated_time': recommendation['estimated_time'],
            'reason': recommendation['reason'],
        }

    def summarize(
        self,
        document: str,
        intent: str = 'technical_overview',
        summary_level: str = 'brief',
        quality_preference: str = 'balanced',
        max_length: int = 130,
        min_length: int = 50,
        num_beams: int = 3,
        language: Optional[str] = None,
        use_rag: bool = False,
    ) -> str:
        """
        Full intent-aware summarization pipeline:
        1. Intent pre-filtering (sentence selection)
        2. Optional RAG context retrieval
        3. Model generation with quality-tuned params
        4. Intent + level post-processing
        5. Language translation (if non-English)

        Args:
            document: Document text to summarize
            intent: Summarization intent (normalised through the intent classifier)
            summary_level: Level name forwarded to post-processing
            quality_preference: Quality preset name for ``get_quality_config``
            max_length: Maximum generation length passed to the model
            min_length: Minimum generation length passed to the model
            num_beams: Beam count passed to the model
            language: Optional per-call language override
            use_rag: Index the pre-filtered text and summarize retrieved chunks

        Returns:
            The formatted (and, for non-English targets, translated) summary.
            An empty or whitespace-only document yields an empty string.
        """
        # Nothing to summarize: skip the model instead of generating from an
        # empty input.
        if not document or not document.strip():
            logger.warning("[Summarize] empty document; returning empty summary")
            return ""

        if language and language != self.language:
            # NOTE(review): this override is written onto the shared loader and
            # tokenizer and is not reverted after the call - confirm intended.
            self.model_loader.language = language
            self.model_loader.language_code = self.model_loader.SUPPORTED_LANGUAGES.get(
                language.lower(), 'en_XX'
            )
            if hasattr(self.tokenizer, 'src_lang'):
                self.tokenizer.src_lang = self.model_loader.language_code

        # Validate intent string
        if isinstance(intent, str):
            intent = self.intent_classifier.classify_intent(intent)

        # -- Step 1: Intent-aware pre-filtering --
        intent_text = extract_intent_relevant_text(document, intent, max_chars=3000)
        logger.info(
            "[Summarize] intent=%s level=%s quality=%s",
            intent, summary_level, quality_preference,
        )
        logger.info("[Summarize] pre-filter: %s chars selected", len(intent_text))

        # -- Step 2: Optional RAG --
        summary_text = intent_text
        if use_rag:
            indexing_stats = self.rag_pipeline.index_document(intent_text)
            logger.info("RAG indexed: %s", indexing_stats)
            intent_prompt = self.intent_classifier.get_prompt_for_intent(intent)
            retrieved_chunks = self.rag_pipeline.retrieve_context(intent_prompt, k=3)
            if retrieved_chunks:
                merged = self.rag_pipeline.merge_context(
                    [chunk for chunk, _ in retrieved_chunks],
                    [score for _, score in retrieved_chunks]
                )
                # Fall back to the pre-filtered text when retrieval/merging
                # produced nothing usable.
                summary_text = merged or intent_text

        # -- Step 3: Generate --
        quality_cfg = get_quality_config(quality_preference)
        raw_summary = self._generate_summary(
            summary_text, intent, max_length, min_length,
            num_beams, quality_cfg,
        )

        # -- Step 4: Post-process (intent format + level formatting) --
        formatted = postprocess_summary(raw_summary, intent, summary_level)

        # -- Step 5: Translate if non-English --
        target_lang = language or self.language
        if target_lang and target_lang.lower() not in ('english', 'en'):
            formatted = translate_summary(formatted, target_lang)

        return formatted

    def _prepare_for_summarization(
        self,
        abstract: str,
        text: str,
        max_length: int
    ) -> str:
        """
        Prepare text for summarization, handling long documents.

        Texts longer than 4000 characters are reduced to the first two chunks
        returned by ``chunk_text``; shorter texts are used whole.  The abstract,
        when present, is placed first and separated by a single space in both
        cases.

        Args:
            abstract: Document abstract if available
            text: Main document text
            max_length: Target max length (currently not used by this helper)

        Returns:
            Prepared text for summarization
        """
        pieces = [abstract] if abstract else []
        if len(text) > 4000:
            chunks = chunk_text(text, chunk_size=1000, overlap=100)
            pieces.extend(chunks[:2])
        else:
            pieces.append(text)
        return " ".join(pieces).strip()

    def _generate_summary(
        self,
        text: str,
        intent: str,
        max_length: int,
        min_length: int,
        num_beams: int,
        quality_cfg: Optional[Dict[str, Any]] = None,
    ) -> str:
        """
        Generate a summary from the model input built by ``build_t5_input``.

        Args:
            text: Text to summarize
            intent: Intent passed to ``build_t5_input``
            max_length: Maximum generation length
            min_length: Minimum generation length (clamped to ``max_length``)
            num_beams: Beam count for beam search
            quality_cfg: Supplies 'no_repeat_ngram_size' (default 3) and
                'length_penalty' (default 1.2); the 'balanced' preset is used
                when omitted

        Returns:
            Decoded summary text with special tokens removed
        """
        if quality_cfg is None:
            quality_cfg = get_quality_config('balanced')

        # A minimum above the maximum is an unsatisfiable constraint; clamp it
        # rather than hand it to generate().
        if min_length > max_length:
            logger.warning(
                "[Generate] min_length=%s exceeds max_length=%s; clamping",
                min_length, max_length,
            )
            min_length = max_length

        input_text = build_t5_input(text, intent)
        inputs = self.tokenizer(
            input_text,
            return_tensors='pt',
            max_length=512,
            truncation=True,
            padding='max_length'
        ).to(self.device)

        with torch.no_grad():
            summary_ids = self.model.generate(
                inputs['input_ids'],
                attention_mask=inputs.get('attention_mask'),
                max_length=max_length,
                min_length=min_length,
                num_beams=num_beams,
                early_stopping=True,
                do_sample=False,
                no_repeat_ngram_size=quality_cfg.get('no_repeat_ngram_size', 3),
                length_penalty=quality_cfg.get('length_penalty', 1.2),
            )

        summary = self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        logger.info(
            "[Generate] intent=%s | words=%s | beams=%s",
            intent, len(summary.split()), num_beams,
        )
        return summary

    def _simplify_language(self, text: str) -> str:
        """
        Simplify language to make summary easy to understand.

        Replaces a fixed list of formal words with plainer ones (whole words
        only, case-insensitive, keeping the capitalisation of the matched
        word), then breaks sentences longer than 120 characters at ' and ' into
        separate sentences.

        Args:
            text: Text to simplify

        Returns:
            Simplified text
        """
        import re

        simplifications = {
            'utilize': 'use',
            'demonstrate': 'show',
            'implement': 'create',
            'facilitate': 'help',
            'novel': 'new',
            'proposed': 'suggested',
            'efficacy': 'effectiveness',
            'robust': 'strong',
            'comprehensive': 'complete',
            'subsequent': 'next',
            'aforementioned': 'mentioned',
            'henceforth': 'from now on',
        }

        # One compiled alternation handles every word in a single pass.
        word_pattern = re.compile(
            r'\b(' + '|'.join(re.escape(word) for word in simplifications) + r')\b',
            flags=re.IGNORECASE,
        )

        def _swap(match):
            found = match.group(0)
            plain = simplifications[found.lower()]
            # Carry the original capitalisation over so sentence-initial words
            # stay capitalised.
            if found.isupper():
                return plain.upper()
            if found[0].isupper():
                return plain[0].upper() + plain[1:]
            return plain

        simplified = word_pattern.sub(_swap, text)

        sentences = re.split(r'(?<=[.!?])\s+', simplified)
        simplified_sentences = []
        for sentence in sentences:
            if len(sentence) <= 120:
                simplified_sentences.append(sentence)
                continue

            # Split very long sentences at the conjunction.
            parts = [part.strip() for part in sentence.split(' and ') if part.strip()]
            if len(parts) < 2:
                simplified_sentences.append(sentence)
                continue

            last_index = len(parts) - 1
            for index, part in enumerate(parts):
                if index > 0:
                    part = part[0].upper() + part[1:]
                if index < last_index and part[-1] not in '.!?':
                    # Close the fragment so it reads as its own sentence.
                    part = (part.rstrip(',;:') or part) + '.'
                simplified_sentences.append(part)

        return ' '.join(simplified_sentences)

    def summarize_batch(
        self,
        documents: List[str],
        intent: str = 'technical_overview',
        language: Optional[str] = None,
        return_keywords: bool = False
    ) -> List[Any]:
        """
        Summarize multiple documents one after another.

        A failure on one document is logged and recorded; it does not stop the
        remaining documents from being processed.

        Args:
            documents: List of documents to summarize
            intent: Summarization intent
            language: Language for summarization
            return_keywords: Accepted for interface compatibility; this method
                currently does not act on it

        Returns:
            One entry per input document, in order: the summary string on
            success, or ``{'error': <message>}`` when summarization raised.
        """
        total = len(documents)
        logger.info("Batch summarizing %s documents...", total)

        results = []
        for i, doc in enumerate(documents, 1):
            try:
                result = self.summarize(
                    doc,
                    intent=intent,
                    language=language
                )
            except Exception as e:
                # Deliberate best-effort boundary: keep the batch going.
                logger.exception("Error processing document %s: %s", i, e)
                results.append({'error': str(e)})
            else:
                results.append(result)
                logger.info("Processed %s/%s", i, total)

        logger.info("Batch summarization complete")
        return results

    def _format_as_bullets(self, summary: str) -> str:
        """
        Format summary as bullet points, one per sentence.

        Sentences are split where '.', '!' or '?' is followed by whitespace, so
        decimals and version numbers such as "3.5" stay in one bullet.  A
        trailing period is dropped from each bullet; '!' and '?' are kept.
        Abbreviations followed by a space (e.g. "e.g. x") still split.

        Args:
            summary: Summary text

        Returns:
            Bullets joined by newlines ('' for empty input)
        """
        import re

        bullets = []
        for sentence in re.split(r'(?<=[.!?])\s+', summary.strip()):
            item = sentence.strip().rstrip('.').strip()
            if item:
                bullets.append(f"• {item}")
        return '\n'.join(bullets)

    def summarize_with_sections(
        self,
        document: str,
        max_length_per_section: int = 100
    ) -> Dict[str, str]:
        """
        Summarize document with separate summaries for each section.

        Sections whose content is empty or whitespace-only are skipped.
        Sections sharing a title overwrite one another in the result.

        Args:
            document: Document text
            max_length_per_section: Max length for each section summary

        Returns:
            Dictionary of section summaries keyed by section title
        """
        sections = self.parser.extract_sections(document)

        # summarize() defaults min_length to 50; keep it from exceeding a
        # smaller per-section maximum.
        section_min_length = min(50, max_length_per_section)

        summaries = {}
        for section_title, section_content in sections:
            if not section_content.strip():
                continue
            summaries[section_title] = self.summarize(
                section_content,
                max_length=max_length_per_section,
                min_length=section_min_length,
            )
        return summaries

    def get_model_info(self) -> Dict[str, Any]:
        """Get information about the loaded model (delegates to the model loader)."""
        return self.model_loader.get_model_info()