Spaces:

rrizwan98
/

honcho-api

Runtime error

rrizwan98

Honcho self-hosted deployment for HF Spaces

66227af about 2 months ago

7.98 kB

	import datetime
	import logging
	from io import BytesIO
	from typing import Any, Protocol

	from fastapi import UploadFile
	from nanoid import generate as generate_nanoid
	from sqlalchemy import Integer, select
	from sqlalchemy.ext.asyncio import AsyncSession

	from src import schemas
	from src.config import settings
	from src.exceptions import (
	FileProcessingError,
	UnsupportedFileTypeError,
	ValidationException,
	)
	from src.schemas import Message

	logger = logging.getLogger(__name__)


	class FileProcessor(Protocol):
	    """Structural interface implemented by every file-type processor.

	    A processor advertises which content types it accepts and converts the
	    raw bytes of such a file into plain text.
	    """

	    def supports_file_type(self, content_type: str) -> bool:
	        """Return True when this processor can handle ``content_type``."""

	    async def extract_text(self, content: bytes) -> str:
	        """Return the text extracted from the raw file bytes ``content``."""


	class PDFProcessor:
	    """Processor that turns PDF bytes into page-labelled plain text."""

	    def supports_file_type(self, content_type: str) -> bool:
	        """Return True only for the exact content type ``application/pdf``."""
	        return content_type == "application/pdf"

	    async def extract_text(self, content: bytes) -> str:
	        """Extract the text of every page, prefixing each with ``[Page N]``.

	        Pages whose extracted text is missing or whitespace-only are skipped.
	        The remaining pages are joined with a blank line between them.
	        """
	        # Imported lazily so the dependency is only loaded when a PDF arrives.
	        import pdfplumber

	        labelled_pages: list[str] = []
	        with pdfplumber.open(BytesIO(content)) as document:
	            for number, page in enumerate(document.pages, start=1):
	                page_text = page.extract_text()
	                if not page_text or not page_text.strip():
	                    continue
	                labelled_pages.append(f"[Page {number}]\n{page_text}")
	        return "\n\n".join(labelled_pages)


	class TextProcessor:
	    """Processor for any ``text/*`` upload; decodes the bytes to ``str``."""

	    def supports_file_type(self, content_type: str) -> bool:
	        """Return True for every content type that starts with ``text/``."""
	        return content_type.startswith("text/")

	    async def extract_text(self, content: bytes) -> str:
	        """Decode ``content`` using the most plausible encoding.

	        Order of attempts:
	          1. A byte-order mark, when present, selects ``utf-8-sig`` or
	             ``utf-16`` (the BOM is stripped from the result).
	          2. Plain ``utf-8``.
	          3. BOM-less ``utf-16``, but only when the data contains NUL bytes
	             and has an even length. Without this guard any even-length
	             8-bit legacy text would be silently decoded as UTF-16 garbage
	             before the ``latin-1`` fallback was ever reached.
	          4. ``latin-1``, which accepts every byte sequence.

	        Raises:
	            ValueError: If no candidate encoding can decode the content. This
	                is a safeguard only; ``latin-1`` never fails to decode.
	        """
	        import codecs

	        candidates: list[str] = []
	        if content.startswith(codecs.BOM_UTF8):
	            candidates.append("utf-8-sig")
	        elif content.startswith((codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE)):
	            candidates.append("utf-16")
	        candidates.append("utf-8")
	        if b"\x00" in content and len(content) % 2 == 0:
	            candidates.append("utf-16")
	        candidates.append("latin-1")

	        for encoding in candidates:
	            try:
	                return content.decode(encoding)
	            except UnicodeDecodeError:
	                continue
	        raise ValueError("Could not decode text file")


	class JSONProcessor:
	    """Processor for ``application/json`` uploads."""

	    def supports_file_type(self, content_type: str) -> bool:
	        """Return True only for the exact content type ``application/json``."""
	        return content_type == "application/json"

	    async def extract_text(self, content: bytes) -> str:
	        """Validate the uploaded JSON and return it re-serialised as text.

	        The bytes are decoded as UTF-8; a leading byte-order mark is
	        tolerated and stripped, because ``json.loads`` rejects a string that
	        still starts with one. Whitespace-only content yields an empty
	        string. Non-ASCII characters are kept as-is in the output.

	        Raises:
	            ValidationException: If the content is not UTF-8 or is not valid
	                JSON.
	        """
	        import json

	        try:
	            # "utf-8-sig" behaves like "utf-8" but drops an optional BOM.
	            decoded_content = content.decode("utf-8-sig")
	        except UnicodeDecodeError as exc:
	            raise ValidationException("JSON uploads must be UTF-8 encoded") from exc

	        if not decoded_content.strip():
	            return ""

	        try:
	            data = json.loads(decoded_content)
	        except json.JSONDecodeError as exc:
	            raise ValidationException("Uploaded JSON is invalid") from exc

	        # Re-serialise so the stored text is normalised, valid JSON.
	        return json.dumps(data, ensure_ascii=False)


	class FileProcessingService:
	    """Route an uploaded file to the first processor that supports its type."""

	    def __init__(self):
	        # Order matters: the first processor that accepts the type wins.
	        self.processors: list[FileProcessor] = [
	            PDFProcessor(),
	            TextProcessor(),
	            JSONProcessor(),
	            # Add more processors as needed
	        ]

	    async def extract_text_from_upload(self, file: UploadFile) -> str:
	        """Extract text from uploaded file without saving to disk.

	        Reads the whole upload into memory, rewinds it so it can be read
	        again by the caller, and delegates to the matching processor.

	        Raises:
	            UnsupportedFileTypeError: If no registered processor accepts the
	                upload's content type (a missing type is treated as "").
	        """
	        content = await file.read()

	        # Reset file position in case it's needed again
	        await file.seek(0)

	        processor = self._get_processor(file.content_type or "")
	        if processor is None:
	            raise UnsupportedFileTypeError(
	                f"Unsupported file type: {file.content_type}. Supported types: {[p.__class__.__name__ for p in self.processors]}"
	            )

	        return await processor.extract_text(content)

	    def _get_processor(self, content_type: str) -> FileProcessor | None:
	        """Return the first processor supporting ``content_type``, else None.

	        The value is reduced to its bare media type first: parameters such
	        as ``; charset=utf-8`` are dropped and the type is lower-cased.
	        Otherwise processors that compare the type exactly would reject
	        headers like ``application/json; charset=utf-8``.
	        """
	        media_type = content_type.split(";", 1)[0].strip().lower()
	        for processor in self.processors:
	            if processor.supports_file_type(media_type):
	                return processor
	        return None


	def split_text_into_chunks(text: str, max_chars: int = 49500) -> list[str]:
	    """Split text into chunks that fit within message limits.

	    The chunks are consecutive slices of ``text``: concatenating them gives
	    back the original string, and none is longer than ``max_chars``. Each cut
	    is placed after the last paragraph break, line break, sentence end or
	    space found inside the current window, tried in that order. When none is
	    found the text is cut at exactly ``max_chars`` characters.

	    Args:
	        text: Text to split. Text that already fits is returned as a single
	            chunk, including the empty string.
	        max_chars: Maximum number of characters per chunk; must be >= 1.

	    Returns:
	        The list of chunks, in order.

	    Raises:
	        ValueError: If ``max_chars`` is smaller than 1. Such a value would
	            otherwise never advance through the text and loop forever.
	    """
	    if max_chars < 1:
	        raise ValueError("max_chars must be a positive integer")

	    text_length = len(text)
	    if text_length <= max_chars:
	        return [text]

	    chunks: list[str] = []
	    current_pos = 0

	    while current_pos < text_length:
	        end_pos = current_pos + max_chars

	        # The remainder fits in one chunk: take it all and stop.
	        if end_pos >= text_length:
	            chunks.append(text[current_pos:])
	            break

	        # Look for good break points (paragraph, line, sentence, word). A
	        # delimiter sitting exactly at current_pos is ignored so that a chunk
	        # never consists of the delimiter alone.
	        break_pos = end_pos
	        for delimiter in ("\n\n", "\n", ". ", " "):
	            last_delimiter = text.rfind(delimiter, current_pos, end_pos)
	            if last_delimiter > current_pos:
	                break_pos = last_delimiter + len(delimiter)
	                break

	        chunks.append(text[current_pos:break_pos])
	        current_pos = break_pos

	    return chunks


	async def get_file_messages(
	    db: AsyncSession,
	    workspace_name: str,
	    file_id: str,
	    session_name: str | None = None,
	) -> list[Message]:
	    """Get all messages for a specific document, ordered by chunk_index.

	    Messages are matched on the workspace name and on the ``file_id`` value
	    stored in their ``internal_metadata`` JSON.

	    Args:
	        db: Async database session used to run the query.
	        workspace_name: Workspace the messages must belong to.
	        file_id: Value of ``internal_metadata["file_id"]`` to match.
	        session_name: When truthy, additionally restrict the results to this
	            session; ``None`` or an empty string applies no session filter.

	    Returns:
	        The matching rows, sorted by ``internal_metadata["chunk_index"]``
	        interpreted as an integer.
	    """
	    from sqlalchemy import and_, func

	    # NOTE(review): this local import shadows the module-level ``Message``
	    # (imported from src.schemas) inside the function, so the query selects
	    # the ``src.models`` class while the return annotation still names the
	    # schema type - confirm this is intended.
	    from src.models import Message

	    query = select(Message).where(
	        and_(
	            Message.workspace_name == workspace_name,
	            func.jsonb_extract_path_text(Message.internal_metadata, "file_id")
	            == file_id,
	        )
	    )

	    if session_name:
	        query = query.where(Message.session_name == session_name)

	    # The extracted JSON value is text; cast it so ordering is numeric
	    # (otherwise "10" would sort before "2").
	    query = query.order_by(
	        func.jsonb_extract_path_text(Message.internal_metadata, "chunk_index").cast(
	            Integer
	        )
	    )

	    result = await db.execute(query)
	    return list(result.scalars().all())


	async def process_file_uploads_for_messages(
	    file: UploadFile,
	    peer_id: str,
	    max_chars: int = settings.MAX_MESSAGE_SIZE,
	    metadata: dict[str, Any] | None = None,
	    configuration: schemas.MessageConfiguration | None = None,
	    created_at: datetime.datetime | None = None,
	) -> list[dict[str, Any]]:
	    """
	    Process an uploaded file and prepare message creation data.

	    This function extracts text from a file, splits it into chunks, and prepares
	    the data needed to create messages. Nothing is written to the database here.

	    Args:
	        file: Uploaded file to process
	        peer_id: ID of the peer creating the messages
	        max_chars: Maximum characters per message chunk
	        metadata: Optional metadata to associate with all messages created from this file
	        configuration: Optional configuration to associate with all messages created from this file
	        created_at: Optional created_at timestamp to use for all messages created from this file

	    Returns:
	        List of dictionaries, one per chunk, each containing ``message_create``
	        (a ``schemas.MessageCreate``) and ``file_metadata``. In the metadata,
	        ``chunk_character_range`` is the half-open ``[start, end]`` offset pair
	        of the chunk within the extracted text.

	    Raises:
	        UnsupportedFileTypeError: If no processor supports the file's content type
	        FileProcessingError: If no message data could be produced
	        Exception: Errors raised by the selected processor while extracting
	            text (e.g. ValidationException for invalid JSON) propagate unchanged
	    """

	    file_processor = FileProcessingService()
	    all_message_data: list[dict[str, Any]] = []

	    # Process the uploaded file
	    extracted_text = await file_processor.extract_text_from_upload(file)

	    # Split into chunks and create messages
	    chunks = split_text_into_chunks(extracted_text, max_chars=max_chars)
	    file_id = generate_nanoid()
	    total_chunks = len(chunks)

	    # Chunks are consecutive slices of the extracted text but are usually
	    # shorter than max_chars (the splitter cuts at paragraph/sentence/word
	    # boundaries), so offsets are accumulated from the real chunk lengths
	    # instead of being derived from i * max_chars.
	    chunk_start = 0

	    for i, chunk in enumerate(chunks):
	        chunk_end = chunk_start + len(chunk)

	        # Create message with optional metadata, configuration, and created_at.
	        # An empty file yields a single chunk with empty content.
	        message_create = schemas.MessageCreate(
	            content=chunk,
	            peer_id=peer_id,
	            metadata=metadata,
	            configuration=configuration,
	            created_at=created_at,
	        )

	        # Store file metadata separately to add to internal_metadata later
	        file_metadata = {
	            "file_id": file_id,
	            "filename": file.filename,
	            "chunk_index": i,
	            "total_chunks": total_chunks,
	            "original_file_size": file.size,
	            "content_type": file.content_type,
	            "chunk_character_range": [chunk_start, chunk_end],
	        }

	        all_message_data.append(
	            {
	                "message_create": message_create,
	                "file_metadata": file_metadata,
	            }
	        )
	        chunk_start = chunk_end

	    if not all_message_data:
	        raise FileProcessingError()

	    return all_message_data