File size: 2,859 Bytes
673435a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
"""
Transcript Model
"""

from datetime import datetime
from sqlalchemy import Column, Integer, String, Text, DateTime, ForeignKey, JSON, Float
from sqlalchemy.orm import relationship

from .base import Base
from ..core.security_encryption import EncryptedString


class Transcript(Base):
    """ORM model for a row of the ``transcripts`` table.

    Holds the transcription text, timestamped segment/word data, language
    and translation fields, NLP analysis results, and summary metadata.
    ``raw_text``, ``processed_text`` and ``summary`` are declared with the
    project's ``EncryptedString`` column type; every other column uses a
    plain SQLAlchemy type.
    """

    __tablename__ = "transcripts"

    id = Column(Integer, primary_key=True, index=True)
    # Optional link to the source audio file (nullable, indexed).
    audio_file_id = Column(Integer, ForeignKey("audio_files.id"), nullable=True, index=True)
    # user_id removed (Auth disabled for portfolio)

    # Transcript content - ENCRYPTED (EncryptedString column type)
    raw_text = Column(EncryptedString(10000), nullable=True)  # Original transcription
    processed_text = Column(EncryptedString(10000), nullable=True)  # After NLP processing

    # Segments with timestamps and speaker info (JSON array)
    # Format: [{"start": 0.0, "end": 1.5, "text": "Hello", "speaker": "SPEAKER_1", "confidence": 0.95}]
    segments = Column(JSON, nullable=True)

    # Word-level timestamps (JSON array)
    # Format: [{"word": "hello", "start": 0.0, "end": 0.5, "confidence": 0.98}]
    words = Column(JSON, nullable=True)

    # Language info
    language = Column(String(10), nullable=True)  # Transcription language
    translation_language = Column(String(10), nullable=True)  # If translated
    # NOTE(review): stored as plain Text, unlike raw_text/processed_text/summary
    # which use EncryptedString - confirm this is intentional.
    translated_text = Column(Text, nullable=True)

    # NLP Analysis (Phase 2)
    sentiment = Column(JSON, nullable=True)  # {"overall": "positive", "score": 0.8, "segments": [...]}
    topics = Column(JSON, nullable=True)  # ["technology", "business"]
    keywords = Column(JSON, nullable=True)  # [{"word": "AI", "score": 0.9}]
    action_items = Column(JSON, nullable=True)  # [{"text": "Email John", "assignee": "Speaker 1"}]
    attendees = Column(JSON, nullable=True)  # ["Speaker 1", "Speaker 2"]
    summary = Column(EncryptedString(5000), nullable=True)  # ENCRYPTED

    # Metadata
    confidence = Column(Float, nullable=True)  # Overall confidence score
    duration = Column(Float, nullable=True)  # Audio duration in seconds
    word_count = Column(Integer, nullable=True)

    # Timestamps: both default to datetime.utcnow at insert; updated_at is
    # additionally refreshed with datetime.utcnow on update.
    created_at = Column(DateTime, default=datetime.utcnow, index=True)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)

    # Relationships
    # Paired with the "transcripts" attribute on AudioFile via back_populates.
    audio_file = relationship("AudioFile", back_populates="transcripts")
    # user relationship removed

    def __repr__(self):
        """Return a debug string with the id and up to 50 chars of raw_text.

        Text longer than 50 characters is cut to its first 50 characters
        followed by "..."; shorter, empty or missing text is shown as-is.
        """
        text = self.raw_text
        if text and len(text) > 50:
            preview = text[:50] + "..."
        else:
            preview = text
        return f"<Transcript(id={self.id}, preview='{preview}')>"


# Note: Float (used by the confidence and duration columns) is imported from sqlalchemy at the top of this module.