Spaces:

Dev-ks04
/

contexto-api

Running

Dev-ks04

feat: Contexto FastAPI backend - intent-aware summarization engine

39028c9 1 day ago

21 kB

	"""
	Intent-Aware Engine (v2)
	==========================
	Drives genuinely different output per intent by combining:

	1. Pre-filtering – sentence-level keyword scoring + section detection
	2. Prompt shaping – T5/BART-compatible instruction prefix per intent
	3. Level handling – executive / brief / detailed / bullets control length & format
	4. Post-processing – format, deduplicate, and label the raw model output
	5. Translation – lightweight deep-translator for non-English output

	Intent catalogue
	----------------
	technical_overview   High-level: what the system/paper does, key technologies
	detailed_analysis    Deep-dive: methods, architecture, design decisions, equations
	methodology          HOW: algorithm, pipeline, dataset, training, evaluation setup
	results              WHAT was found: numbers, benchmarks, metrics, comparisons
	conclusion           So what: conclusions, implications, limitations, future work
	abstract             Compact academic abstract: problem → approach → result
	"""

	import re
	import logging
	from typing import List, Tuple, Dict, Optional

	logger = logging.getLogger(__name__)


	# ─────────────────────────────────────────────────────────────────────────────
	# Summary-level configuration
	# ─────────────────────────────────────────────────────────────────────────────

	# Per-level generation bounds and the display label used for each summary level.
	LEVEL_CONFIG = {
	    "executive": {
	        "min_length": 20,
	        "max_length": 60,
	        "num_beams": 3,
	        "label": "Executive Summary",
	    },
	    "brief": {
	        "min_length": 50,
	        "max_length": 130,
	        "num_beams": 3,
	        "label": "Brief Summary",
	    },
	    "detailed": {
	        "min_length": 100,
	        "max_length": 280,
	        "num_beams": 4,
	        "label": "Detailed Summary",
	    },
	    "bullets": {
	        "min_length": 60,
	        "max_length": 200,
	        "num_beams": 4,
	        "label": "Bullet Points",
	    },
	}
	# Fallback entry for unrecognised level names.
	_DEFAULT_LEVEL = LEVEL_CONFIG["brief"]

	# ─────────────────────────────────────────────────────────────────────────────
	# Quality-mode → generation params
	# ─────────────────────────────────────────────────────────────────────────────

	# Generation parameters keyed by quality mode name.
	QUALITY_CONFIG = {
	    "speed": {
	        "num_beams": 2,
	        "no_repeat_ngram_size": 2,
	        "length_penalty": 1.0,
	    },
	    "balanced": {
	        "num_beams": 4,
	        "no_repeat_ngram_size": 3,
	        "length_penalty": 1.2,
	    },
	    "quality": {
	        "num_beams": 6,
	        "no_repeat_ngram_size": 3,
	        "length_penalty": 1.5,
	    },
	}
	# Fallback entry for unrecognised quality names.
	_DEFAULT_QUALITY = QUALITY_CONFIG["balanced"]

	# ─────────────────────────────────────────────────────────────────────────────
	# Intent configuration table
	# ─────────────────────────────────────────────────────────────────────────────

	# One entry per supported intent. Each entry provides:
	#   t5_prefix        instruction text prepended to the model input
	#   focus_keywords   lower-case substrings used to score sentences; every
	#                    keyword found in a sentence adds to its score, so a
	#                    keyword must appear only once per list or it is
	#                    counted twice
	#   section_priority substrings matched against detected section names
	#   label            display label for the formatted summary
	#   postprocess      formatting mode selected in postprocess_summary
	INTENT_CONFIG: Dict[str, dict] = {
	    "technical_overview": {
	        "t5_prefix": "summarize the key technical contributions, system design, and core technologies: ",
	        "focus_keywords": [
	            "propose", "present", "introduce", "system", "framework", "architecture",
	            "approach", "model", "method", "technique", "design", "develop", "built",
	            "based on", "consists of", "leverages", "uses", "overview", "we propose",
	            "this paper", "this work", "we present", "contribution", "novel",
	        ],
	        "section_priority": ["abstract", "introduction", "overview", "system", "contribution"],
	        "label": "Technical Overview",
	        "postprocess": "standard",
	    },

	    "detailed_analysis": {
	        "t5_prefix": (
	            "provide a comprehensive technical analysis covering the architecture, "
	            "implementation details, design choices, and key algorithms: "
	        ),
	        "focus_keywords": [
	            "layer", "encoder", "decoder", "attention", "network", "module",
	            "equation", "formula", "parameter", "trained", "fine-tune", "embedding",
	            "vector", "dimension", "kernel", "function", "optimization", "gradient",
	            "loss", "activation", "transformer", "convolutional", "recurrent",
	            "algorithm", "complexity", "implementation", "configuration",
	            "mechanism", "architecture", "block", "head", "weight", "hidden",
	        ],
	        "section_priority": ["method", "approach", "model", "architecture", "implementation", "detail"],
	        "label": "Detailed Technical Analysis",
	        "postprocess": "detailed",
	    },

	    "methodology": {
	        "t5_prefix": (
	            "summarize the methodology, experimental setup, dataset used, "
	            "training procedure, evaluation metrics, and pipeline: "
	        ),
	        # "split" used to be listed twice, which double-counted it when scoring.
	        "focus_keywords": [
	            "dataset", "corpus", "training", "evaluation", "experiment", "setup",
	            "pipeline", "procedure", "step", "process", "baseline", "split",
	            "validation", "test", "train", "fine-tun", "pre-train", "annotation",
	            "collect", "sample", "batch", "epoch", "hyperparameter", "learning rate",
	            "benchmark", "protocol", "augmentation", "fold", "cross-validation",
	        ],
	        "section_priority": ["method", "methodology", "experiment", "setup", "data", "training"],
	        "label": "Methodology & Experimental Setup",
	        "postprocess": "methodology",
	    },

	    "results": {
	        "t5_prefix": (
	            "summarize the experimental results, key quantitative findings, "
	            "performance metrics, and comparisons with baselines: "
	        ),
	        # "recall" used to be listed twice, which double-counted it when scoring.
	        "focus_keywords": [
	            "result", "performance", "accuracy", "score", "f1", "bleu", "rouge",
	            "precision", "recall", "achieves", "outperform", "beats", "surpass",
	            "improves", "gain", "%", "percent", "state-of-the-art", "sota",
	            "baseline", "comparison", "ablation", "demonstrate", "show",
	            "table", "figure", "metric", "evaluation", "benchmark", "error rate",
	            "map", "ndcg", "auc", "roc",
	        ],
	        "section_priority": ["result", "experiment", "evaluation", "performance", "finding", "ablation"],
	        "label": "Results & Findings",
	        "postprocess": "results",
	    },

	    "conclusion": {
	        "t5_prefix": (
	            "summarize the conclusions, key takeaways, limitations of the work, "
	            "and potential future research directions: "
	        ),
	        "focus_keywords": [
	            "conclude", "conclusion", "summary", "finding", "contribution",
	            "implication", "limitation", "future", "direction", "work",
	            "suggest", "recommend", "open problem", "challenge", "prospect",
	            "overall", "in summary", "in conclusion", "we show", "we demonstrate",
	            "our work", "this paper", "insight", "takeaway", "impact",
	        ],
	        "section_priority": ["conclusion", "summary", "discussion", "future", "limitation"],
	        "label": "Conclusions & Implications",
	        "postprocess": "conclusion",
	    },

	    "abstract": {
	        "t5_prefix": (
	            "write a concise abstract covering the problem statement, proposed approach, "
	            "key results, and significance of the work: "
	        ),
	        "focus_keywords": [
	            "propose", "present", "address", "problem", "challenge", "solution",
	            "achieve", "result", "outperform", "significant", "novel",
	            "contribution", "state-of-the-art", "demonstrate", "this paper",
	            "we present", "we propose", "this work", "task", "application",
	        ],
	        "section_priority": ["abstract", "introduction", "conclusion"],
	        "label": "Abstract",
	        "postprocess": "abstract",
	    },
	}

	# Fallback entry for unrecognised intent names.
	_DEFAULT_INTENT = INTENT_CONFIG["technical_overview"]


	# ─────────────────────────────────────────────────────────────────────────────
	# Public config accessors
	# ─────────────────────────────────────────────────────────────────────────────

	def get_intent_config(intent: str) -> dict:
	    """Return the INTENT_CONFIG entry for *intent*, or the default entry if it is unknown."""
	    try:
	        return INTENT_CONFIG[intent]
	    except KeyError:
	        return _DEFAULT_INTENT

	def get_level_config(level: str) -> dict:
	    """Return the LEVEL_CONFIG entry for *level*, or the default entry if it is unknown."""
	    if level in LEVEL_CONFIG:
	        return LEVEL_CONFIG[level]
	    return _DEFAULT_LEVEL

	def get_quality_config(quality: str) -> dict:
	    """Return the QUALITY_CONFIG entry for *quality*, or the default entry if it is unknown."""
	    return QUALITY_CONFIG[quality] if quality in QUALITY_CONFIG else _DEFAULT_QUALITY


	# ─────────────────────────────────────────────────────────────────────────────
	# Sentence utilities
	# ─────────────────────────────────────────────────────────────────────────────

	def _split_sentences(text: str) -> List[str]:
	raw = re.split(r"(?<=[.!?])\s+", text.strip())
	return [s.strip() for s in raw if len(s.strip()) > 25]


	def _score_sentences(sentences: List[str], keywords: List[str]) -> List[Tuple[str, float]]:
	kw_lower = [k.lower() for k in keywords]
	scored = []
	for sent in sentences:
	sl = sent.lower()
	score = sum(1.5 if len(kw) > 6 else 1.0 for kw in kw_lower if kw in sl)
	scored.append((sent, score))
	return scored


	def _detect_sections(text: str) -> Dict[str, str]:
	heading_re = re.compile(
	r"^(#{1,4}\s+.+\|(?:abstract\|introduction\|background\|related work\|"
	r"methodology\|method\|approach\|experiment\|result\|evaluation\|"
	r"discussion\|conclusion\|limitation\|future work\|references)"
	r"[:\s]*$)",
	re.IGNORECASE \| re.MULTILINE,
	)
	parts = heading_re.split(text)
	sections: Dict[str, str] = {}
	i = 0
	current = "body"
	while i < len(parts):
	part = parts[i].strip()
	if heading_re.match(part):
	current = part.lower().strip("#").strip()
	i += 1
	if i < len(parts):
	sections[current] = parts[i].strip()
	i += 1
	else:
	sections.setdefault(current, "")
	sections[current] += " " + part
	i += 1
	return {k: v.strip() for k, v in sections.items() if v.strip()}


	# ─────────────────────────────────────────────────────────────────────────────
	# Core pre-filtering
	# ─────────────────────────────────────────────────────────────────────────────

	def extract_intent_relevant_text(
	    document: str,
	    intent: str,
	    max_chars: int = 3000,
	) -> str:
	    """
	    Select the most intent-relevant sentences from the document.

	    Strategy:
	    1. Detect named sections and put the intent's priority sections first.
	    2. Score every distinct sentence against the intent keywords.
	    3. Keep the top-scoring sentences, in the order they were scanned
	       (priority sections first, then the rest of the document).

	    Args:
	        document: Full source text.
	        intent: Intent name; unknown names use the default intent config.
	        max_chars: Upper bound on the length of the returned text.

	    Returns:
	        The selected sentences joined by spaces and cut at a word boundary
	        to fit ``max_chars``. Falls back to a plain prefix of the priority
	        or full text when no sentence can be selected.
	    """
	    cfg = get_intent_config(intent)
	    keywords: List[str] = cfg["focus_keywords"]
	    section_priority: List[str] = cfg["section_priority"]

	    sections = _detect_sections(document)

	    # For each priority, take the first section whose name contains it.
	    # A section matched by several priorities is only included once.
	    priority_chunks: List[str] = []
	    for priority in section_priority:
	        for sec_name, sec_text in sections.items():
	            if priority in sec_name.lower():
	                if sec_text not in priority_chunks:
	                    priority_chunks.append(sec_text)
	                break
	    priority_text = " ".join(priority_chunks)

	    all_text = (priority_text + " " + document).strip()
	    # The priority text is also part of the document, so its sentences show up
	    # twice. Drop exact repeats (keeping the first occurrence) so they neither
	    # take up top-N slots nor appear twice in the output.
	    sentences = list(dict.fromkeys(_split_sentences(all_text)))

	    if not sentences:
	        return document[:max_chars]

	    scored = _score_sentences(sentences, keywords)
	    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)

	    # Consider the top third of the sentences, but never fewer than 6,
	    # and keep only those that matched at least one keyword.
	    top_n = max(6, len(sentences) // 3)
	    top_set = {sentence for sentence, score in ranked[:top_n] if score > 0}

	    if not top_set:
	        # Nothing scored: return the priority section text, or the full text.
	        return (priority_text or all_text)[:max_chars]

	    # Restore scan order.
	    ordered = [sentence for sentence in sentences if sentence in top_set]
	    result = " ".join(ordered)

	    if len(result) > max_chars:
	        # Cut at the last space inside the limit to avoid ending mid-word.
	        result = result[:max_chars].rsplit(" ", 1)[0]

	    logger.info(
	        "[IntentEngine] intent=%s, total=%s, selected=%s, chars=%s",
	        intent, len(sentences), len(ordered), len(result),
	    )
	    return result or all_text[:max_chars]


	# ─────────────────────────────────────────────────────────────────────────────
	# T5 input construction
	# ─────────────────────────────────────────────────────────────────────────────

	def build_t5_input(filtered_text: str, intent: str) -> str:
	    """Prepend the intent's ``t5_prefix`` instruction to the pre-filtered text."""
	    intent_cfg = get_intent_config(intent)
	    return "{}{}".format(intent_cfg["t5_prefix"], filtered_text)


	# ─────────────────────────────────────────────────────────────────────────────
	# Post-processing
	# ─────────────────────────────────────────────────────────────────────────────

	def _clean_raw(text: str) -> str:
	"""Basic cleanup: collapse whitespace and deduplicate sentences."""
	text = re.sub(r"\s{2,}", " ", text.strip())
	seen: set = set()
	deduped = []
	for sent in re.split(r"(?<=[.!?])\s+", text):
	key = sent.lower().strip()
	if key and key not in seen:
	seen.add(key)
	deduped.append(sent.strip())
	return " ".join(deduped)


	def _to_bullets(text: str) -> str:
	"""Convert a paragraph into a bullet-point list."""
	sentences = [s.strip() for s in re.split(r"(?<=[.!?])\s+", text) if len(s.strip()) > 15]
	if not sentences:
	return text
	return "\n".join(f"• {s}" for s in sentences)


	def _to_numbered(text: str) -> str:
	sentences = [s.strip() for s in re.split(r"(?<=[.!?])\s+", text) if len(s.strip()) > 15]
	if len(sentences) < 2:
	return text
	return "\n".join(f"{i+1}. {s}" for i, s in enumerate(sentences))


	# Spans highlighted in "results" mode: a percentage, a decimal number, or a
	# metric name followed by the rest of its clause when that clause contains a
	# digit. A "." that is directly followed by a digit is treated as a decimal
	# point, not as the end of the clause, so "95.2%" is never cut in half.
	_METRIC_SPAN_RE = re.compile(
	    r"(\d+\.?\d*\s*%"
	    r"|\b\d+\.\d+\b"
	    r"|\b(?:accuracy|f1|bleu|rouge|score|precision|recall|auc)"
	    r"(?:[^.]|\.(?=\d))*\d(?:[^.]|\.(?=\d))*\.?)",
	    re.IGNORECASE,
	)
	# Openings that already read as a conclusion / an abstract and need no framing.
	_CONCLUSION_LEAD_RE = re.compile(
	    r"^(in conclusion|to conclude|overall|this (paper|work|study))",
	    re.IGNORECASE,
	)
	_ABSTRACT_LEAD_RE = re.compile(
	    r"^(this (paper|work|study)|we (propose|present|introduce)|in this)",
	    re.IGNORECASE,
	)


	def _lower_first(text: str) -> str:
	    """Lower-case the first character unless the text starts with an acronym (e.g. "BERT")."""
	    if len(text) > 1 and text[1].isupper():
	        return text
	    return text[:1].lower() + text[1:]


	def postprocess_summary(raw_summary: str, intent: str, summary_level: str = "brief") -> str:
	    """
	    Apply intent + level specific post-processing to the raw model output.

	    Modes (from the intent's "postprocess" setting):
	    standard    → label + clean paragraph
	    detailed    → clean paragraph, terminated with a full stop if needed
	    methodology → numbered steps
	    results     → numbers and metric clauses prefixed with ►
	    conclusion  → "In conclusion, …" framing unless already framed
	    abstract    → "This work …" framing unless already framed

	    Summary level overrides (take precedence over the mode):
	    bullets   → always convert to a • bullet list
	    executive → keep only the first two sentences

	    Args:
	        raw_summary: Text produced by the summarization model.
	        intent: Intent name; unknown names use the default intent config.
	        summary_level: Summary level name; only "bullets" and "executive"
	            change the formatting.

	    Returns:
	        A "[Label]" line followed by the formatted summary. An empty summary
	        yields just the label line and a newline.
	    """
	    cfg = get_intent_config(intent)
	    mode = cfg.get("postprocess", "standard")
	    label = cfg["label"]

	    summary = _clean_raw(raw_summary)

	    # Bullets level overrides everything: convert to a bullet list.
	    if summary_level == "bullets":
	        return f"[{label}]\n{_to_bullets(summary)}"

	    # Executive level: very short, first two sentences at most.
	    if summary_level == "executive":
	        sentences = [s for s in re.split(r"(?<=[.!?])\s+", summary) if s.strip()]
	        return f"[{label}]\n{' '.join(sentences[:2])}"

	    # Intent-specific formatting. These modes use fixed labels rather than
	    # the configured one.
	    if mode == "detailed":
	        if summary and summary[-1] not in ".!?":
	            summary += "."
	        return f"[Detailed Analysis]\n{summary}"

	    if mode == "methodology":
	        return f"[Methodology]\n{_to_numbered(summary)}"

	    if mode == "results":
	        highlighted = _METRIC_SPAN_RE.sub(
	            lambda m: f"► {m.group(0).strip()}",
	            summary,
	        )
	        return f"[Results & Findings]\n{highlighted}"

	    if mode == "conclusion":
	        # Guard against an empty summary, which has no first character to re-case.
	        if summary and not _CONCLUSION_LEAD_RE.match(summary):
	            summary = "In conclusion, " + _lower_first(summary)
	        return f"[Conclusions]\n{summary}"

	    if mode == "abstract":
	        if summary and not _ABSTRACT_LEAD_RE.match(summary):
	            summary = "This work " + _lower_first(summary)
	        return f"[Abstract]\n{summary}"

	    # standard / technical_overview
	    return f"[{label}]\n{summary}"


	# ─────────────────────────────────────────────────────────────────────────────
	# Translation (non-English output)
	# ─────────────────────────────────────────────────────────────────────────────

	# Map our language names → deep-translator language codes
	_LANG_CODES = {
	"english": "en",
	"spanish": "es",
	"french": "fr",
	"german": "de",
	"italian": "it",
	"portuguese": "pt",
	"chinese": "zh-CN",
	"japanese": "ja",
	"korean": "ko",
	"arabic": "ar",
	"hindi": "hi",
	"russian": "ru",
	"turkish": "tr",
	"vietnamese": "vi",
	"thai": "th",
	}


	def translate_summary(text: str, target_language: str) -> str:
	    """
	    Translate the English summary to the target language using deep-translator.

	    Args:
	        text: English summary text.
	        target_language: Language name (case-insensitive, e.g. "Spanish").
	            "english", "en", an empty string, or None mean no translation.

	    Returns:
	        The translated text, or the original text if the target is English,
	        the text is blank, the language is unknown, deep-translator is not
	        installed, or the translation fails.
	    """
	    # A missing language is treated as "no translation requested" instead of
	    # failing on None.lower().
	    lang = (target_language or "").lower().strip()
	    if lang in ("english", "en", ""):
	        return text

	    # Nothing to translate; skip the network round-trip.
	    if not text or not text.strip():
	        return text

	    target_code = _LANG_CODES.get(lang)
	    if not target_code:
	        logger.warning("[Translation] Unknown language: %s, skipping translation", lang)
	        return text

	    try:
	        # Imported lazily so the module still loads without the optional dependency.
	        from deep_translator import GoogleTranslator
	    except ImportError:
	        logger.warning("[Translation] deep-translator not installed. pip install deep-translator")
	        return text

	    try:
	        translator = GoogleTranslator(source="en", target=target_code)
	        # Split into chunks ≤ 4500 chars (API limit)
	        translated_chunks = [
	            translator.translate(chunk)
	            for chunk in _chunk_for_translation(text, max_chars=4500)
	        ]
	        result = " ".join(translated_chunks)
	    except Exception as e:
	        # Best-effort: any translation failure falls back to the English text.
	        logger.error("[Translation] Failed: %s", e)
	        return text

	    logger.info("[Translation] Translated to %s (%s chars)", lang, len(result))
	    return result


	def _chunk_for_translation(text: str, max_chars: int = 4500) -> List[str]:
	"""Split text into chunks that fit within the translation API limit."""
	if len(text) <= max_chars:
	return [text]
	sentences = re.split(r"(?<=[.!?\n])\s+", text)
	chunks, current = [], ""
	for s in sentences:
	if len(current) + len(s) + 1 > max_chars:
	if current:
	chunks.append(current.strip())
	current = s
	else:
	current = (current + " " + s).strip()
	if current:
	chunks.append(current.strip())
	return chunks