Spaces:

Dev-ks04
/

contexto-api

Running

Dev-ks04

feat: Contexto FastAPI backend - intent-aware summarization engine

39028c9 2 days ago

4.79 kB

	"""
	Lightweight keyword extraction
	"""

	import logging
	from typing import List, Dict
	import re
	from collections import Counter

	logger = logging.getLogger(__name__)


	class KeywordExtractor:
	    """Extract important keywords from documents (lightweight, no heavy NLP).

	    Tokenization is purely regex-based: a token is a run of ASCII lowercase
	    letters and underscores delimited by word boundaries (the text is
	    lowercased first). Words that contain digits or non-ASCII letters are
	    therefore not matched as tokens.
	    """

	    # Compiled once at class-creation time rather than on every call.
	    _WORD_RE = re.compile(r'\b[a-z_]+\b')
	    _SENTENCE_SPLIT_RE = re.compile(r'[.!?]+')

	    def __init__(self):
	        """Initialize keyword extractor with a built-in English stopword set."""
	        # Common stopwords
	        self.stopwords = {
	            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
	            'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'been', 'be',
	            'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
	            'should', 'may', 'might', 'can', 'this', 'that', 'these', 'those',
	            'i', 'you', 'he', 'she', 'it', 'we', 'they', 'what', 'which', 'who',
	            'when', 'where', 'why', 'how', 'all', 'each', 'every', 'both', 'more',
	            'most', 'other', 'some', 'any', 'such', 'no', 'nor', 'not', 'only',
	            'same', 'so', 'than', 'too', 'very', 'just', 'about', 'also', 'our'
	        }

	    def extract_keywords(
	        self,
	        text: str,
	        top_k: int = 10,
	        min_length: int = 3
	    ) -> List[str]:
	        """
	        Extract top keywords from text (simple TF approach).

	        Args:
	            text: Input text. Empty or None yields an empty list.
	            top_k: Number of keywords to extract
	            min_length: Minimum keyword length

	        Returns:
	            List of top keywords, most frequent first. Words with equal
	            counts keep the order in which they first appear in the text.
	        """
	        if not text:
	            return []

	        # Lowercase, tokenize, then drop stopwords and short words
	        words = self._WORD_RE.findall(text.lower())
	        word_freq = Counter(
	            w for w in words
	            if len(w) >= min_length and w not in self.stopwords
	        )

	        return [word for word, _ in word_freq.most_common(top_k)]

	    def extract_phrases(
	        self,
	        text: str,
	        top_k: int = 5,
	        phrase_len: int = 2
	    ) -> List[str]:
	        """
	        Extract key phrases (multi-word terms).

	        Phrases are word n-grams that never cross a sentence boundary
	        ('.', '!' or '?') and contain no stopwords.

	        Args:
	            text: Input text. Empty or None yields an empty list.
	            top_k: Number of phrases to extract
	            phrase_len: Length of phrases (2-3 words). Values below 1
	                yield an empty list.

	        Returns:
	            List of top phrases, most frequent first
	        """
	        # A non-positive n-gram size has no meaningful phrases; without this
	        # guard phrase_len=0 would produce empty-string "phrases".
	        if not text or phrase_len < 1:
	            return []

	        phrases = []
	        for sentence in self._SENTENCE_SPLIT_RE.split(text):
	            words = self._WORD_RE.findall(sentence.lower())
	            # Slide a window of phrase_len words across the sentence
	            for start in range(len(words) - phrase_len + 1):
	                window = words[start:start + phrase_len]
	                # Skip if contains stopwords
	                if not any(w in self.stopwords for w in window):
	                    phrases.append(' '.join(window))

	        phrase_freq = Counter(phrases)
	        return [phrase for phrase, _ in phrase_freq.most_common(top_k)]

	    def extract_all(
	        self,
	        text: str,
	        keywords_k: int = 10,
	        phrases_k: int = 5
	    ) -> Dict[str, List[str]]:
	        """
	        Extract both keywords and phrases.

	        Args:
	            text: Input text
	            keywords_k: Number of keywords
	            phrases_k: Number of phrases

	        Returns:
	            Dictionary with 'keywords' and 'key_phrases' lists
	        """
	        return {
	            'keywords': self.extract_keywords(text, top_k=keywords_k),
	            'key_phrases': self.extract_phrases(text, top_k=phrases_k)
	        }

	    def score_keywords(
	        self,
	        text: str,
	        keywords: List[str]
	    ) -> Dict[str, float]:
	        """
	        Score keywords based on frequency and position.

	        The score is occurrences per 100 whitespace-separated words,
	        multiplied by 1.5 when the first occurrence starts in the first
	        quarter of the text, and capped at 10.0. Matching is
	        case-insensitive and substring-based (a keyword is also counted
	        when it occurs inside a longer word).

	        Args:
	            text: Input text. Empty, whitespace-only or None text scores
	                every keyword 0.0.
	            keywords: List of keywords to score

	        Returns:
	            Dictionary mapping each keyword (as passed in) to its score
	        """
	        text_lower = text.lower() if text else ''
	        # Hoisted out of the loop: both values are the same for every keyword
	        word_count = len(text_lower.split())
	        early_cutoff = len(text_lower) / 4
	        scores = {}

	        for keyword in keywords:
	            # The text is lowercased, so the keyword must be as well
	            needle = keyword.lower()

	            # An empty needle "matches" at every offset, and a text with no
	            # words would divide by zero below; both score 0.
	            count = text_lower.count(needle) if needle and word_count else 0
	            if count == 0:
	                scores[keyword] = 0.0
	                continue

	            # Check position (higher score if in beginning); count > 0
	            # guarantees find() returns a real index, never -1
	            if text_lower.find(needle) < early_cutoff:
	                position_score = 1.5
	            else:
	                position_score = 1.0

	            # Calculate TF-like score (occurrences per 100 words)
	            score = (count * position_score) / (word_count / 100)
	            scores[keyword] = min(score, 10.0)  # Cap at 10

	        return scores