Spaces:

Dev-ks04
/

contexto-api

Running

Dev-ks04

feat: Contexto FastAPI backend - intent-aware summarization engine

39028c9 2 days ago

2.75 kB

	import os
	import json
	import logging
	from typing import List, Dict, Any
	from pathlib import Path


	# Set up the root logger once at import time: INFO threshold and
	# "timestamp - logger name - level - message" formatted records.
	logging.basicConfig(
	    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
	    level=logging.INFO,
	)

	# Logger named after this module; shared by the helpers defined below.
	logger = logging.getLogger(__name__)


	def load_config(config_path: str) -> Dict[str, Any]:
	    """
	    Load configuration from a JSON file.

	    The file is always decoded as UTF-8 (the encoding JSON mandates) rather
	    than the platform's locale default, so non-ASCII values read the same
	    on every OS.

	    Args:
	        config_path: Path to configuration file

	    Returns:
	        The parsed JSON content, or an empty dict when the file does not
	        exist (the miss is logged at ERROR level).

	    Raises:
	        json.JSONDecodeError: If the file exists but is not valid JSON.
	    """
	    # Only the read/parse step is guarded; a missing file is the single
	    # condition that falls back to an empty configuration.
	    try:
	        with open(config_path, 'r', encoding='utf-8') as f:
	            config = json.load(f)
	    except FileNotFoundError:
	        logger.error(f"Configuration file not found: {config_path}")
	        return {}

	    logger.info(f"Configuration loaded from {config_path}")
	    return config


	def save_config(config: Dict[str, Any], config_path: str) -> None:
	    """
	    Save configuration to a JSON file.

	    Missing parent directories are created first. The file is written as
	    UTF-8 with 4-space indentation, overwriting any existing file.

	    Args:
	        config: Configuration dictionary
	        config_path: Path to save configuration
	    """
	    # Make sure the destination directory exists before opening the file.
	    Path(config_path).parent.mkdir(parents=True, exist_ok=True)

	    # Explicit encoding keeps the output independent of the locale default.
	    with open(config_path, 'w', encoding='utf-8') as f:
	        json.dump(config, f, indent=4)

	    logger.info(f"Configuration saved to {config_path}")


	def chunk_text(text: str, chunk_size: int = 512, overlap: int = 50) -> List[str]:
	    """
	    Split text into overlapping chunks.

	    Sizes are measured in characters. Each chunk starts
	    ``chunk_size - overlap`` characters after the previous one, so
	    consecutive chunks share ``overlap`` characters. The last chunk may be
	    shorter than ``chunk_size``.

	    Args:
	        text: Input text to chunk
	        chunk_size: Size of each chunk; must be positive
	        overlap: Overlap between consecutive chunks; must satisfy
	            ``0 <= overlap < chunk_size``

	    Returns:
	        List of text chunks (empty when ``text`` is empty)

	    Raises:
	        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
	            negative or not smaller than ``chunk_size``.
	    """
	    if chunk_size <= 0:
	        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
	    # overlap >= chunk_size would make the step zero or negative (no
	    # forward progress); a negative overlap would skip text between chunks.
	    if not 0 <= overlap < chunk_size:
	        raise ValueError(
	            f"overlap must satisfy 0 <= overlap < chunk_size, "
	            f"got overlap={overlap}, chunk_size={chunk_size}"
	        )

	    step = chunk_size - overlap
	    # Every start index is < len(text), so each slice is non-empty.
	    chunks = [text[start:start + chunk_size] for start in range(0, len(text), step)]

	    logger.info(f"Text split into {len(chunks)} chunks")
	    return chunks


	def merge_chunks(chunks: List[str], overlap: int = 50) -> str:
	    """
	    Merge overlapping text chunks back into a single text.

	    The first chunk is kept whole; from every later chunk the leading
	    ``overlap`` characters are dropped before it is appended. A later chunk
	    no longer than ``overlap`` contributes nothing.

	    Args:
	        chunks: List of text chunks
	        overlap: Original overlap size; must be non-negative

	    Returns:
	        Merged text (empty string when ``chunks`` is empty)

	    Raises:
	        ValueError: If ``chunks`` is non-empty and ``overlap`` is negative.
	    """
	    if not chunks:
	        return ""

	    # A negative value would slice from the END of each chunk and silently
	    # produce garbage, so reject it explicitly.
	    if overlap < 0:
	        raise ValueError(f"overlap must be non-negative, got {overlap}")

	    # Single join instead of repeated string concatenation in a loop.
	    return chunks[0] + "".join(chunk[overlap:] for chunk in chunks[1:])


	def get_file_size(file_path: str) -> int:
	    """
	    Get file size in bytes.

	    Args:
	        file_path: Path of the file to inspect

	    Returns:
	        Size of the file in bytes

	    Raises:
	        OSError: If the file does not exist or is inaccessible.
	    """
	    return os.path.getsize(file_path)


	def count_tokens_approximate(text: str, chars_per_token: int = 4) -> int:
	    """
	    Approximate token count using a character-based heuristic.

	    The estimate is the text length in characters, floor-divided by
	    ``chars_per_token``; text shorter than ``chars_per_token`` therefore
	    counts as 0 tokens. For more accurate counting, use a tokenizer from
	    the transformers library.

	    Args:
	        text: Input text
	        chars_per_token: Assumed average number of characters per token;
	            must be positive

	    Returns:
	        Approximate token count

	    Raises:
	        ValueError: If ``chars_per_token`` is not positive.
	    """
	    if chars_per_token <= 0:
	        raise ValueError(f"chars_per_token must be positive, got {chars_per_token}")

	    # Rough estimate: 1 token ≈ 4 characters by default
	    return len(text) // chars_per_token