Spaces:
Sleeping
Sleeping
"""
Pydantic models for API request validation.

This module defines the input models for the topic segmentation API,
including transcript data, prompt requests, and validation constraints.
"""
from typing import List, Optional, Dict, Any, Union
from datetime import datetime
from enum import Enum
from pydantic import BaseModel, Field, field_validator, ConfigDict
from pydantic.types import PositiveInt, NonNegativeFloat
from config.settings import AnthropicModel
class SpeakerRole(str, Enum):
    """Roles a speaker can hold within a transcript.

    Inherits from ``str`` so members compare equal to their plain
    string values and serialize naturally in request/response models.
    """

    INTERVIEWER = "interviewer"    # asks the questions
    INTERVIEWEE = "interviewee"    # answers the questions
    MODERATOR = "moderator"        # steers a group discussion
    PARTICIPANT = "participant"    # general discussion member
    CUSTOMER = "customer"          # customer side of a call
    AGENT = "agent"                # company side of a call
    UNKNOWN = "unknown"            # role could not be determined
class LanguageCode(str, Enum):
    """Language codes accepted for transcript processing.

    String-valued: members serialize as their ISO 639-1 codes, plus the
    sentinel ``"auto"`` which asks the pipeline to detect the language.
    """

    ENGLISH = "en"
    CZECH = "cs"
    SLOVAK = "sk"
    AUTO_DETECT = "auto"  # detect language automatically
class PromptTemplate(str, Enum):
    """Identifiers of the pre-built prompt templates for business scenarios.

    ``CUSTOM`` signals that the caller supplies their own prompt text
    instead of using a built-in template.
    """

    INTERVIEW = "interview"
    CUSTOMER_CALL = "customer_call"
    FEEDBACK_TICKET = "feedback_ticket"
    GENERAL_COMMENTARY = "general_commentary"
    CUSTOM = "custom"
class TranscriptSentence(BaseModel):
    """
    Individual sentence in a transcript with metadata.

    Represents a single sentence or utterance in the transcript
    with timing information, speaker details, and content.
    """

    model_config = ConfigDict(
        str_strip_whitespace=True,   # strip leading/trailing whitespace on all str fields
        validate_assignment=True,    # re-validate on attribute assignment
        extra="forbid"               # reject unknown fields
    )

    # Core content
    text: str = Field(
        ...,
        min_length=1,
        max_length=2000,
        description="The actual text content of the sentence"
    )

    # Indexing and identification
    sentence_index: PositiveInt = Field(
        ...,
        description="Sequential index of the sentence in the transcript (1-based)"
    )

    # Timing information
    start_time: NonNegativeFloat = Field(
        ...,
        description="Start time of the sentence in seconds"
    )
    end_time: NonNegativeFloat = Field(
        ...,
        description="End time of the sentence in seconds"
    )

    # Speaker information
    speaker: str = Field(
        ...,
        min_length=1,
        max_length=100,
        description="Speaker identifier or name"
    )
    speaker_role: Optional[SpeakerRole] = Field(
        default=SpeakerRole.UNKNOWN,
        description="Role of the speaker in the conversation"
    )

    # Optional metadata
    confidence_score: Optional[float] = Field(
        default=None,
        ge=0.0,
        le=1.0,
        description="Transcription confidence score (0.0 to 1.0)"
    )
    language: Optional[LanguageCode] = Field(
        default=None,
        description="Detected or specified language of the sentence"
    )
    metadata: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Additional metadata for the sentence"
    )

    # BUG FIX: these validators were plain methods with no decorator, so
    # pydantic v2 never invoked them (and `cls`-first plain methods are
    # broken as instance methods anyway). They must be registered with
    # @field_validator and marked @classmethod.
    @field_validator('end_time')
    @classmethod
    def validate_end_time_after_start(cls, v, info):
        """Ensure end_time is strictly greater than start_time.

        `info.data` holds already-validated fields; `start_time` is
        declared before `end_time`, so it is present unless it itself
        failed validation (hence the membership guard).
        """
        if 'start_time' in info.data and v <= info.data['start_time']:
            raise ValueError('end_time must be greater than start_time')
        return v

    @field_validator('text')
    @classmethod
    def validate_text_content(cls, v):
        """Reject empty or whitespace-only text; return it stripped."""
        if not v or v.isspace():
            raise ValueError('text cannot be empty or only whitespace')
        return v.strip()
class PromptConfiguration(BaseModel):
    """
    Configuration for dynamic prompt injection.

    Allows customization of the topic extraction prompt
    while maintaining output format consistency.
    """

    model_config = ConfigDict(
        str_strip_whitespace=True,
        validate_assignment=True,
        extra="forbid"
    )

    # Template selection
    template: PromptTemplate = Field(
        default=PromptTemplate.INTERVIEW,
        description="Pre-built prompt template to use"
    )

    # Custom prompt (when template is CUSTOM)
    custom_prompt: Optional[str] = Field(
        default=None,
        min_length=10,
        max_length=5000,
        description="Custom prompt text (required when template is CUSTOM)"
    )

    # Language specification
    language: LanguageCode = Field(
        default=LanguageCode.AUTO_DETECT,
        description="Language for processing and prompts"
    )

    # Business context
    business_domain: Optional[str] = Field(
        default=None,
        max_length=200,
        description="Business domain or industry context"
    )

    # Additional instructions
    additional_instructions: Optional[str] = Field(
        default=None,
        max_length=1000,
        description="Additional instructions to append to the prompt"
    )

    # Output format preferences
    include_confidence_scores: bool = Field(
        default=True,
        description="Whether to include confidence scores in output"
    )
    include_speaker_analysis: bool = Field(
        default=True,
        description="Whether to include speaker-specific analysis"
    )

    # BUG FIX: this was an undecorated plain method, so pydantic v2 never
    # ran it and the CUSTOM-template requirement was silently unenforced.
    @field_validator('custom_prompt')
    @classmethod
    def validate_custom_prompt(cls, v, info):
        """Require custom_prompt whenever template is CUSTOM.

        `template` is declared before `custom_prompt`, so its validated
        value is available via `info.data` here.
        """
        if 'template' in info.data and info.data['template'] == PromptTemplate.CUSTOM:
            if not v:
                raise ValueError('custom_prompt is required when template is CUSTOM')
        return v
class ModelConfiguration(BaseModel):
    """
    Configuration for Anthropic model selection and parameters.
    Allows fine-tuning of model behavior for specific use cases.
    """
    model_config = ConfigDict(
        validate_assignment=True,  # re-validate on attribute assignment
        extra="forbid"             # reject unknown fields
    )
    # Model selection; None means "use the service default model".
    model: Optional[AnthropicModel] = Field(
        default=None,
        description="Specific Anthropic model to use (uses default if not specified)"
    )
    # Generation parameters (max_tokens capped at 8000 by the `le` constraint)
    max_tokens: PositiveInt = Field(
        default=4000,
        le=8000,
        description="Maximum tokens to generate"
    )
    # Defaults to 0.0 for deterministic output; bounded to [0.0, 1.0].
    temperature: float = Field(
        default=0.0,
        ge=0.0,
        le=1.0,
        description="Sampling temperature (0.0 for deterministic, 1.0 for creative)"
    )
    # Fallback configuration
    enable_fallback: bool = Field(
        default=True,
        description="Whether to enable automatic fallback to alternative models"
    )
    # Timeout settings: default 5 minutes, hard cap 10 minutes.
    timeout_seconds: PositiveInt = Field(
        default=300,
        le=600,
        description="Request timeout in seconds"
    )
class TranscriptRequest(BaseModel):
    """
    Main request model for transcript topic segmentation.

    Contains the transcript data and all configuration options
    for processing and analysis.
    """

    model_config = ConfigDict(
        str_strip_whitespace=True,
        validate_assignment=True,
        extra="forbid",
        # Allow field names starting with "model_" without pydantic warnings.
        protected_namespaces=()
    )

    # Core transcript data (1 to 1500 sentences)
    sentences: List[TranscriptSentence] = Field(
        ...,
        min_length=1,
        max_length=1500,
        description="List of transcript sentences with metadata"
    )

    # Request metadata
    transcript_id: Optional[str] = Field(
        default=None,
        max_length=100,
        description="Optional identifier for the transcript"
    )
    transcript_title: Optional[str] = Field(
        default=None,
        max_length=200,
        description="Optional title or description of the transcript"
    )

    # Processing configuration
    prompt_config: Optional[PromptConfiguration] = Field(
        default_factory=PromptConfiguration,
        description="Prompt configuration for topic extraction"
    )
    # NOTE: the alias equals the field name, so it is a no-op; kept for
    # backward compatibility with existing callers.
    model_config_override: Optional[ModelConfiguration] = Field(
        default=None,
        description="Model configuration overrides",
        alias="model_config_override"
    )

    # Processing options
    merge_similar_topics: bool = Field(
        default=True,
        description="Whether to merge similar or duplicate topics"
    )
    min_topic_length: PositiveInt = Field(
        default=2,
        le=10,
        description="Minimum number of sentences for a topic"
    )
    include_metadata: bool = Field(
        default=True,
        description="Whether to include detailed metadata in response"
    )

    # Client information
    client_info: Optional[Dict[str, str]] = Field(
        default=None,
        description="Optional client information for logging and analytics"
    )

    # BUG FIX: both validators below were plain undecorated methods, so
    # pydantic v2 never executed them and unordered/non-sequential input
    # passed silently. Registered with @field_validator + @classmethod.
    @field_validator('sentences')
    @classmethod
    def validate_sentences_order(cls, v):
        """Validate sentences are in chronological order.

        Checks both strictly increasing sentence_index and
        non-decreasing start_time between consecutive sentences.
        """
        if len(v) < 2:
            return v
        for i in range(1, len(v)):
            if v[i].sentence_index <= v[i - 1].sentence_index:
                raise ValueError('Sentences must be in ascending order by sentence_index')
            if v[i].start_time < v[i - 1].start_time:
                raise ValueError('Sentences must be in chronological order by start_time')
        return v

    @field_validator('sentences')
    @classmethod
    def validate_sentence_indices(cls, v):
        """Validate sentence indices are exactly 1..len(v) with no gaps.

        Stricter than validate_sentences_order: it also forbids gaps and
        requires the sequence to start at 1.
        """
        expected_indices = list(range(1, len(v) + 1))
        actual_indices = [s.sentence_index for s in v]
        if actual_indices != expected_indices:
            raise ValueError('Sentence indices must be sequential starting from 1')
        return v
class HealthCheckRequest(BaseModel):
    """Input for the health-check endpoint.

    Both flags default to False, so an empty request body is valid and
    opts out of the detailed checks. Unknown fields are rejected.
    """

    model_config = ConfigDict(extra="forbid")

    # Opt-in: run detailed model health checks as part of the response.
    include_model_health: bool = Field(
        default=False,
        description="Whether to include detailed model health checks"
    )
    # Opt-in: attach performance statistics to the response.
    include_performance_stats: bool = Field(
        default=False,
        description="Whether to include performance statistics"
    )
class ModelSwitchRequest(BaseModel):
    """Input for switching the service's active Anthropic model.

    The target model is mandatory; a short free-text reason may be
    supplied for audit purposes. Unknown fields are rejected.
    """

    model_config = ConfigDict(extra="forbid")

    # Required: which model to make active.
    model: AnthropicModel = Field(
        ...,
        description="Model to switch to"
    )
    # Optional short justification (max 200 chars).
    reason: Optional[str] = Field(
        default=None,
        max_length=200,
        description="Optional reason for the model switch"
    )
# Type aliases for convenience
TranscriptData = List[TranscriptSentence]  # a full transcript as an ordered list of sentences
RequestMetadata = Dict[str, Union[str, int, float, bool]]  # flat map of JSON-scalar metadata values