Spaces:
Running
Running
| """ | |
| Intent-Aware Engine (v2) | |
| ========================== | |
| Drives genuinely different output per intent by combining: | |
| 1. Pre-filtering — sentence-level keyword scoring + section detection | |
| 2. Prompt shaping — T5/BART-compatible instruction prefix per intent | |
| 3. Level handling — executive / brief / detailed / bullets control length & format | |
| 4. Post-processing — format, deduplicate, and label the raw model output | |
| 5. Translation — lightweight deep-translator for non-English output | |
| Intent catalogue | |
| ---------------- | |
| technical_overview High-level: what the system/paper does, key technologies | |
| detailed_analysis Deep-dive: methods, architecture, design decisions, equations | |
| methodology HOW: algorithm, pipeline, dataset, training, evaluation setup | |
| results WHAT was found: numbers, benchmarks, metrics, comparisons | |
| conclusion So what: conclusions, implications, limitations, future work | |
| abstract Compact academic abstract: problem → approach → result | |
| """ | |
| import re | |
| import logging | |
| from typing import List, Tuple, Dict, Optional | |
| logger = logging.getLogger(__name__) | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Summary-level configuration | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
# Per-level generation settings (length bounds, beam count) plus a display label.
LEVEL_CONFIG = {
    "executive": dict(min_length=20, max_length=60, num_beams=3, label="Executive Summary"),
    "brief": dict(min_length=50, max_length=130, num_beams=3, label="Brief Summary"),
    "detailed": dict(min_length=100, max_length=280, num_beams=4, label="Detailed Summary"),
    "bullets": dict(min_length=60, max_length=200, num_beams=4, label="Bullet Points"),
}

# Entry handed back when an unknown level name is looked up.
_DEFAULT_LEVEL = LEVEL_CONFIG["brief"]
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Quality-mode → generation params | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
# Generation parameters selected by quality mode name.
QUALITY_CONFIG = {
    "speed": dict(num_beams=2, no_repeat_ngram_size=2, length_penalty=1.0),
    "balanced": dict(num_beams=4, no_repeat_ngram_size=3, length_penalty=1.2),
    "quality": dict(num_beams=6, no_repeat_ngram_size=3, length_penalty=1.5),
}

# Entry handed back when an unknown quality mode is looked up.
_DEFAULT_QUALITY = QUALITY_CONFIG["balanced"]
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Intent configuration table | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
# Per-intent configuration. Each entry provides:
#   t5_prefix        instruction text prepended to the model input
#   focus_keywords   terms used to score sentences during pre-filtering
#   section_priority section-name fragments whose text is pulled in first
#   label            display label for the summary
#   postprocess      formatting mode applied to the raw model output
#
# Keyword lists must not contain duplicates: the sentence scorer adds one
# weight per list entry, so a repeated keyword would count twice.
INTENT_CONFIG: Dict[str, dict] = {
    "technical_overview": {
        "t5_prefix": "summarize the key technical contributions, system design, and core technologies: ",
        "focus_keywords": [
            "propose", "present", "introduce", "system", "framework", "architecture",
            "approach", "model", "method", "technique", "design", "develop", "built",
            "based on", "consists of", "leverages", "uses", "overview", "we propose",
            "this paper", "this work", "we present", "contribution", "novel",
        ],
        "section_priority": ["abstract", "introduction", "overview", "system", "contribution"],
        "label": "Technical Overview",
        "postprocess": "standard",
    },
    "detailed_analysis": {
        "t5_prefix": (
            "provide a comprehensive technical analysis covering the architecture, "
            "implementation details, design choices, and key algorithms: "
        ),
        "focus_keywords": [
            "layer", "encoder", "decoder", "attention", "network", "module",
            "equation", "formula", "parameter", "trained", "fine-tune", "embedding",
            "vector", "dimension", "kernel", "function", "optimization", "gradient",
            "loss", "activation", "transformer", "convolutional", "recurrent",
            "algorithm", "complexity", "implementation", "configuration",
            "mechanism", "architecture", "block", "head", "weight", "hidden",
        ],
        "section_priority": ["method", "approach", "model", "architecture", "implementation", "detail"],
        "label": "Detailed Technical Analysis",
        "postprocess": "detailed",
    },
    "methodology": {
        "t5_prefix": (
            "summarize the methodology, experimental setup, dataset used, "
            "training procedure, evaluation metrics, and pipeline: "
        ),
        "focus_keywords": [
            "dataset", "corpus", "training", "evaluation", "experiment", "setup",
            "pipeline", "procedure", "step", "process", "baseline", "split",
            "validation", "test", "train", "fine-tun", "pre-train", "annotation",
            "collect", "sample", "batch", "epoch", "hyperparameter", "learning rate",
            "benchmark", "protocol", "augmentation", "fold", "cross-validation",
        ],
        "section_priority": ["method", "methodology", "experiment", "setup", "data", "training"],
        "label": "Methodology & Experimental Setup",
        "postprocess": "methodology",
    },
    "results": {
        "t5_prefix": (
            "summarize the experimental results, key quantitative findings, "
            "performance metrics, and comparisons with baselines: "
        ),
        "focus_keywords": [
            "result", "performance", "accuracy", "score", "f1", "bleu", "rouge",
            "precision", "recall", "achieves", "outperform", "beats", "surpass",
            "improves", "gain", "%", "percent", "state-of-the-art", "sota",
            "baseline", "comparison", "ablation", "demonstrate", "show",
            "table", "figure", "metric", "evaluation", "benchmark", "error rate",
            "map", "ndcg", "auc", "roc",
        ],
        "section_priority": ["result", "experiment", "evaluation", "performance", "finding", "ablation"],
        "label": "Results & Findings",
        "postprocess": "results",
    },
    "conclusion": {
        "t5_prefix": (
            "summarize the conclusions, key takeaways, limitations of the work, "
            "and potential future research directions: "
        ),
        "focus_keywords": [
            "conclude", "conclusion", "summary", "finding", "contribution",
            "implication", "limitation", "future", "direction", "work",
            "suggest", "recommend", "open problem", "challenge", "prospect",
            "overall", "in summary", "in conclusion", "we show", "we demonstrate",
            "our work", "this paper", "insight", "takeaway", "impact",
        ],
        "section_priority": ["conclusion", "summary", "discussion", "future", "limitation"],
        "label": "Conclusions & Implications",
        "postprocess": "conclusion",
    },
    "abstract": {
        "t5_prefix": (
            "write a concise abstract covering the problem statement, proposed approach, "
            "key results, and significance of the work: "
        ),
        "focus_keywords": [
            "propose", "present", "address", "problem", "challenge", "solution",
            "achieve", "result", "outperform", "significant", "novel",
            "contribution", "state-of-the-art", "demonstrate", "this paper",
            "we present", "we propose", "this work", "task", "application",
        ],
        "section_priority": ["abstract", "introduction", "conclusion"],
        "label": "Abstract",
        "postprocess": "abstract",
    },
}

# Entry handed back when an unknown intent name is looked up.
_DEFAULT_INTENT = INTENT_CONFIG["technical_overview"]
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Public config accessors | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def get_intent_config(intent: str) -> dict:
    """Return the INTENT_CONFIG entry for ``intent``, or the default entry if unknown."""
    try:
        return INTENT_CONFIG[intent]
    except KeyError:
        return _DEFAULT_INTENT
def get_level_config(level: str) -> dict:
    """Return the LEVEL_CONFIG entry for ``level``, or the default entry if unknown."""
    try:
        return LEVEL_CONFIG[level]
    except KeyError:
        return _DEFAULT_LEVEL
def get_quality_config(quality: str) -> dict:
    """Return the QUALITY_CONFIG entry for ``quality``, or the default entry if unknown."""
    try:
        return QUALITY_CONFIG[quality]
    except KeyError:
        return _DEFAULT_QUALITY
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Sentence utilities | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def _split_sentences(text: str) -> List[str]:
    """Split ``text`` after sentence-ending punctuation and keep fragments longer than 25 chars."""
    kept: List[str] = []
    for fragment in re.split(r"(?<=[.!?])\s+", text.strip()):
        candidate = fragment.strip()
        # Very short fragments (headings, stray tokens) are discarded.
        if len(candidate) > 25:
            kept.append(candidate)
    return kept
def _score_sentences(sentences: List[str], keywords: List[str]) -> List[Tuple[str, float]]:
    """Score each sentence by the intent keywords it contains.

    Every distinct keyword found in a sentence contributes 1.5 if the keyword
    is longer than six characters and 1.0 otherwise. Matching is
    case-insensitive and anchored at the start of a word, so stems such as
    "fine-tun" still match "fine-tuned" while short keywords no longer fire
    inside unrelated words ("roc" in "process", "gain" in "against").
    Duplicate or empty keywords are ignored so no term is counted twice.

    Args:
        sentences: Sentences to score.
        keywords: Keywords or keyword stems to look for.

    Returns:
        ``(sentence, score)`` pairs in the same order as ``sentences``.
    """
    # De-duplicate case-insensitively while preserving order.
    unique_keywords = list(dict.fromkeys(k.lower() for k in keywords if k))

    # Compile once per call instead of once per sentence.
    weighted_patterns = []
    for kw in unique_keywords:
        # The word-start anchor only makes sense when the keyword itself begins
        # with a word character (so "%" is still matched right after a digit).
        anchor = r"(?<!\w)" if re.match(r"\w", kw) else ""
        weight = 1.5 if len(kw) > 6 else 1.0
        weighted_patterns.append((re.compile(anchor + re.escape(kw)), weight))

    scored = []
    for sent in sentences:
        lowered = sent.lower()
        score = sum(weight for pattern, weight in weighted_patterns if pattern.search(lowered))
        scored.append((sent, score))
    return scored
def _detect_sections(text: str) -> Dict[str, str]:
    """Split ``text`` into named sections based on heading lines.

    A heading is either a Markdown heading (``#`` to ``####`` followed by a
    title on the same line) or a line consisting solely of a well-known
    section name (singular or plural, optionally followed by a colon).

    Text before the first heading is stored under ``"body"``. Section names are
    lower-cased with surrounding ``#``, whitespace and a trailing colon
    removed. If the same heading occurs more than once, the contents are
    concatenated rather than overwritten. Sections with no content are omitted.

    Args:
        text: The raw document.

    Returns:
        Mapping of section name to stripped section text.
    """
    heading_re = re.compile(
        r"^(#{1,4}[ \t]+.+|(?:abstract|introduction|background|related work|"
        r"methodology|methods?|approach(?:es)?|experiments?|results?|evaluations?|"
        r"discussions?|conclusions?|limitations?|future work|references)"
        r"[:\s]*$)",
        re.IGNORECASE | re.MULTILINE,
    )
    # With exactly one capturing group, split() yields
    # [preamble, heading1, body1, heading2, body2, ...]; positions alone tell
    # headings from bodies, so no part is ever re-matched against the pattern.
    parts = heading_re.split(text)

    collected: Dict[str, List[str]] = {}
    preamble = parts[0].strip()
    if preamble:
        collected["body"] = [preamble]

    for heading, content in zip(parts[1::2], parts[2::2]):
        name = heading.strip().lower().strip("#").strip().rstrip(":").strip()
        content = content.strip()
        if content:
            collected.setdefault(name, []).append(content)

    return {name: " ".join(chunks) for name, chunks in collected.items()}
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Core pre-filtering | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def extract_intent_relevant_text(
    document: str,
    intent: str,
    max_chars: int = 3000,
) -> str:
    """Select the most intent-relevant sentences from the document.

    Strategy:
      1. Detect named sections and put the text of intent-relevant ones first.
      2. Score every distinct sentence against the intent keywords.
      3. Keep the best-scoring sentences (score > 0), capped at
         ``max(6, total // 3)``, and emit them in their original order.

    Args:
        document: Full source text.
        intent: Intent name; unknown names fall back to the default intent.
        max_chars: Upper bound on the length of the returned text.

    Returns:
        The selected sentences joined by spaces and truncated to ``max_chars``.
        If no sentence can be extracted, the head of ``document`` is returned;
        if no sentence scores above zero, the head of the priority-section
        text (or of the whole text) is returned.
    """
    cfg = get_intent_config(intent)
    keywords: List[str] = cfg["focus_keywords"]
    section_priority: List[str] = cfg["section_priority"]

    sections = _detect_sections(document)

    # Pull high-priority section text first. One section can match several
    # priorities (e.g. "model architecture"); include it only once.
    priority_chunks: List[str] = []
    used_sections = set()
    for priority in section_priority:
        for sec_name, sec_text in sections.items():
            if priority in sec_name.lower():
                if sec_name not in used_sections:
                    used_sections.add(sec_name)
                    priority_chunks.append(sec_text)
                break
    priority_text = " ".join(priority_chunks)

    all_text = f"{priority_text} {document}".strip()

    # The priority text is also part of the document, so every one of its
    # sentences would otherwise be scored and emitted twice. Keep the first
    # occurrence only (dict preserves insertion order).
    sentences = list(dict.fromkeys(_split_sentences(all_text)))
    if not sentences:
        return document[:max_chars]

    scored = _score_sentences(sentences, keywords)
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)

    # Keep at most a third of the sentences, but allow up to six for short texts.
    top_n = max(6, len(sentences) // 3)
    top_set = {sent for sent, score in ranked[:top_n] if score > 0}

    if not top_set:
        # Nothing scored: return the best section text or the full text.
        return (priority_text or all_text)[:max_chars]

    # Restore reading order (priority sections first, then the document).
    ordered = [sent for sent in sentences if sent in top_set]
    result = " ".join(ordered)

    if len(result) > max_chars:
        # Cut at the last word boundary inside the limit.
        result = result[:max_chars].rsplit(" ", 1)[0]

    logger.info(
        "[IntentEngine] intent=%s, total=%d, selected=%d, chars=%d",
        intent, len(sentences), len(ordered), len(result),
    )
    return result or all_text[:max_chars]
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # T5 input construction | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def build_t5_input(filtered_text: str, intent: str) -> str:
    """Return the model input: the intent's instruction prefix followed by the filtered text."""
    intent_cfg = get_intent_config(intent)
    return "{}{}".format(intent_cfg["t5_prefix"], filtered_text)
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Post-processing | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def _clean_raw(text: str) -> str:
    """Collapse runs of whitespace and drop repeated sentences (case-insensitive)."""
    collapsed = re.sub(r"\s{2,}", " ", text.strip())

    # Maps the normalised sentence to its first-seen form; insertion order of
    # the dict keeps the sentences in their original sequence.
    first_seen = {}
    for fragment in re.split(r"(?<=[.!?])\s+", collapsed):
        fingerprint = fragment.lower().strip()
        if fingerprint:
            first_seen.setdefault(fingerprint, fragment.strip())
    return " ".join(first_seen.values())
def _to_bullets(text: str) -> str:
    """Convert a paragraph into a bullet-point list.

    The text is split after sentence-ending punctuation; fragments of 15
    characters or fewer are dropped. Each remaining sentence becomes one line
    prefixed with a bullet ("•"). If no sentence qualifies, ``text`` is
    returned unchanged.
    """
    sentences = [s.strip() for s in re.split(r"(?<=[.!?])\s+", text) if len(s.strip()) > 15]
    if not sentences:
        return text
    # U+2022 BULLET, written as an escape so the marker cannot be corrupted
    # again by a wrong file encoding.
    return "\n".join(f"\u2022 {s}" for s in sentences)
def _to_numbered(text: str) -> str:
    """Turn a paragraph into a numbered list; text with fewer than two usable sentences is returned as is."""
    steps = []
    for fragment in re.split(r"(?<=[.!?])\s+", text):
        trimmed = fragment.strip()
        # Fragments of 15 characters or fewer are not treated as steps.
        if len(trimmed) > 15:
            steps.append(trimmed)
    if len(steps) < 2:
        return text
    return "\n".join(f"{number}. {step}" for number, step in enumerate(steps, start=1))
def _lowercase_initial(text: str) -> str:
    """Lower-case the first character of ``text`` unless it starts an acronym."""
    if not text:
        return text
    # A leading capital followed by another capital or a digit ("BERT", "T5")
    # is treated as an acronym or model name and left untouched.
    if len(text) > 1 and (text[1].isupper() or text[1].isdigit()):
        return text
    return text[0].lower() + text[1:]


def postprocess_summary(raw_summary: str, intent: str, summary_level: str = "brief") -> str:
    """Apply intent- and level-specific post-processing to the raw model output.

    The output is always a bracketed label line followed by the formatted text.

    Modes (from the intent's ``postprocess`` setting):
      standard    -> label + clean paragraph
      detailed    -> clean paragraph, terminated with a full stop if needed
      methodology -> numbered steps
      results     -> numbers and metric phrases prefixed with "►"
      conclusion  -> "In conclusion, ..." framing unless already framed
      abstract    -> "This work ..." framing unless already framed

    Summary-level overrides (take precedence over the intent mode):
      bullets     -> always convert to a "•" bullet list
      executive   -> keep only the first two sentences

    Args:
        raw_summary: Text produced by the model; ``None`` is treated as empty.
        intent: Intent name; unknown names fall back to the default intent.
        summary_level: Summary level name.

    Returns:
        The labelled, formatted summary.
    """
    cfg = get_intent_config(intent)
    mode = cfg.get("postprocess", "standard")
    label = cfg["label"]

    summary = _clean_raw(raw_summary or "")

    # Bullets level overrides everything: convert to a bullet list.
    if summary_level == "bullets":
        return f"[{label}]\n{_to_bullets(summary)}"

    # Executive level: very short, first two sentences at most.
    if summary_level == "executive":
        sentences = [s for s in re.split(r"(?<=[.!?])\s+", summary) if s.strip()]
        return f"[{label}]\n{' '.join(sentences[:2])}"

    # Intent-specific formatting.
    if mode == "detailed":
        if summary and summary[-1] not in ".!?":
            summary += "."
        return f"[Detailed Analysis]\n{summary}"

    if mode == "methodology":
        return f"[Methodology]\n{_to_numbered(summary)}"

    if mode == "results":
        # Highlight percentages, decimals, and metric phrases containing a
        # number. Inside a metric phrase a dot followed by a digit is part of a
        # number ("95.3"), not a sentence end, so it must not stop the match.
        highlighted = re.sub(
            r"(\d+\.?\d*\s*%|\b\d+\.\d+\b|"
            r"\b(?:accuracy|f1|bleu|rouge|score|precision|recall|auc)"
            r"(?:[^.]|\.(?=\d))*\d(?:[^.]|\.(?=\d))*\.?)",
            lambda m: f"\u25ba {m.group(0).strip()}",
            summary,
            flags=re.IGNORECASE,
        )
        return f"[Results & Findings]\n{highlighted}"

    if mode == "conclusion":
        # An empty summary gets no framing (and must not be indexed).
        if summary and not re.match(
            r"^(in conclusion|to conclude|overall|this (paper|work|study))",
            summary, re.IGNORECASE
        ):
            summary = "In conclusion, " + _lowercase_initial(summary)
        return f"[Conclusions]\n{summary}"

    if mode == "abstract":
        if summary and not re.match(
            r"^(this (paper|work|study)|we (propose|present|introduce)|in this)",
            summary, re.IGNORECASE
        ):
            summary = "This work " + _lowercase_initial(summary)
        return f"[Abstract]\n{summary}"

    # standard / technical_overview
    return f"[{label}]\n{summary}"
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Translation (non-English output) | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Map our language names → deep-translator language codes | |
# Lower-case language name -> language code passed to the translator.
_LANG_CODES = dict(
    english="en",
    spanish="es",
    french="fr",
    german="de",
    italian="it",
    portuguese="pt",
    chinese="zh-CN",
    japanese="ja",
    korean="ko",
    arabic="ar",
    hindi="hi",
    russian="ru",
    turkish="tr",
    vietnamese="vi",
    thai="th",
)
def translate_summary(text: str, target_language: str) -> str:
    """Translate the English summary to the target language using deep-translator.

    Translation is best-effort: the original text is returned when the target
    is English, empty or ``None``, when ``text`` is blank, when the language is
    unknown, when deep-translator is not installed, or when translation fails.

    Args:
        text: English text to translate.
        target_language: Language name (e.g. "spanish") or one of the known
            language codes (e.g. "es", "zh-CN"); case and surrounding
            whitespace are ignored.

    Returns:
        The translated text, or ``text`` unchanged as described above.
    """
    lang = (target_language or "").lower().strip()
    if lang in ("english", "en", ""):
        return text
    if not text or not text.strip():
        # Nothing to translate; avoid a pointless API call.
        return text

    target_code = _LANG_CODES.get(lang)
    if not target_code:
        # "en" is already accepted as a code above, so accept the other known
        # codes as well.
        target_code = next(
            (code for code in _LANG_CODES.values() if code.lower() == lang), None
        )
    if not target_code:
        logger.warning("[Translation] Unknown language: %s, skipping translation", lang)
        return text

    try:
        from deep_translator import GoogleTranslator
    except ImportError:
        logger.warning("[Translation] deep-translator not installed. pip install deep-translator")
        return text

    try:
        translator = GoogleTranslator(source="en", target=target_code)
        # Split into chunks of at most 4500 chars (API limit).
        translated_chunks = [
            translator.translate(chunk)
            for chunk in _chunk_for_translation(text, max_chars=4500)
        ]
        result = " ".join(translated_chunks)
    except Exception as e:  # deliberate best-effort boundary around the API call
        logger.error("[Translation] Failed: %s", e)
        return text  # Graceful fallback to English

    logger.info("[Translation] Translated to %s (%d chars)", lang, len(result))
    return result
def _chunk_for_translation(text: str, max_chars: int = 4500) -> List[str]:
    """Split text into chunks that fit within the translation API limit.

    Text that already fits is returned as a single chunk. Otherwise the text
    is split after sentence-ending punctuation or newlines, and sentences are
    packed greedily into chunks of at most ``max_chars`` characters. A single
    sentence longer than ``max_chars`` is broken at the last space inside the
    limit, or hard-cut if it contains no space, so no chunk exceeds the limit.

    Args:
        text: Text to split.
        max_chars: Maximum chunk length; must be positive.

    Returns:
        List of chunks, each at most ``max_chars`` characters long.

    Raises:
        ValueError: If ``max_chars`` is not positive.
    """
    if max_chars <= 0:
        raise ValueError("max_chars must be a positive integer")
    if len(text) <= max_chars:
        return [text]

    chunks: List[str] = []
    current = ""
    for sentence in re.split(r"(?<=[.!?\n])\s+", text):
        if len(sentence) > max_chars:
            sentence = sentence.strip()
            while len(sentence) > max_chars:
                # Flush what has been packed so far to keep the text in order.
                if current:
                    chunks.append(current.strip())
                    current = ""
                # Prefer a word boundary; fall back to a hard cut.
                cut = sentence.rfind(" ", 0, max_chars + 1)
                if cut <= 0:
                    cut = max_chars
                chunks.append(sentence[:cut].rstrip())
                sentence = sentence[cut:].lstrip()
            if not sentence:
                continue

        if len(current) + len(sentence) + 1 > max_chars:
            if current:
                chunks.append(current.strip())
            current = sentence
        else:
            current = (current + " " + sentence).strip()

    if current:
        chunks.append(current.strip())
    return chunks