# Hugging Face Space: empirenexus/TranscriptWriting (status: Sleeping)
# File size: 9,519 Bytes

import os
from typing import Dict, Any

# ============================================================================
# LLM CONFIGURATION
# ============================================================================

# Backend selector. Recognised values: "hf_api" (recommended), "local", "openai".
# Pinned to "hf_api" for the HF Spaces deployment (not read from the environment).
LLM_BACKEND = "hf_api"

# Hugging Face settings: the token comes from the environment (empty string
# when unset); the model id is fixed to a lighter model for Spaces.
HUGGINGFACE_TOKEN = os.environ.get("HUGGINGFACE_TOKEN", "")
HF_MODEL = "mistralai/Mistral-7B-Instruct-v0.2"

# Local-model settings, both overridable through the environment.
# DEVICE accepts "auto", "cpu", "cuda" or "mps".
LOCAL_MODEL = os.environ.get("LOCAL_MODEL", "google/flan-t5-xl")
DEVICE = os.environ.get("DEVICE", "auto")

# OpenAI settings (only relevant when the OpenAI backend is used).
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
OPENAI_MODEL = os.environ.get("OPENAI_MODEL", "gpt-4")

# Generation parameters.
# MAX_TOKENS_PER_REQUEST is kept small for speed on Spaces; validate_config()
# reports values below 100 as too small.
MAX_TOKENS_PER_REQUEST = 100
# Sampling temperature; parsed as float, so a non-numeric LLM_TEMPERATURE
# raises ValueError at import time.
LLM_TEMPERATURE = float(os.environ.get("LLM_TEMPERATURE", "0.3"))
# Chosen to fit the Spaces timeout limit (presumably seconds - confirm with the caller).
LLM_TIMEOUT = 25

# ============================================================================
# CHUNKING CONFIGURATION
# ============================================================================

# Upper bound on tokens per chunk; fixed (lighter setting for Spaces).
# validate_config() reports values below 500 as too small.
MAX_CHUNK_TOKENS = 2000
# Tokens shared between neighbouring chunks; override with OVERLAP_TOKENS.
OVERLAP_TOKENS = int(os.environ.get("OVERLAP_TOKENS", "150"))
# Name of the tokenizer encoding (looks like a tiktoken encoding name -
# confirm against the chunking code, which is not in this file).
TOKENIZER_ENCODING = os.environ.get("TOKENIZER_ENCODING", "cl100k_base")

# ============================================================================
# QUALITY THRESHOLDS
# ============================================================================

# Minimum acceptance limits, each overridable through an environment variable
# of the same name. Non-numeric overrides raise ValueError at import time.
MIN_QUALITY_SCORE = float(os.environ.get("MIN_QUALITY_SCORE", "0.3"))
MIN_WORD_COUNT = int(os.environ.get("MIN_WORD_COUNT", "50"))
MIN_TEXT_LENGTH = int(os.environ.get("MIN_TEXT_LENGTH", "100"))

# Quality grade thresholds (fixed). How a score is compared against them is
# decided by the grading code, which is not part of this file.
QUALITY_EXCELLENT, QUALITY_GOOD, QUALITY_FAIR = 0.8, 0.6, 0.4

# ============================================================================
# FILE PROCESSING CONFIGURATION
# ============================================================================

# Numeric limits, overridable via environment variables of the same name.
MAX_FILE_SIZE_MB = int(os.environ.get("MAX_FILE_SIZE_MB", "50"))
MAX_FILES_PER_BATCH = int(os.environ.get("MAX_FILES_PER_BATCH", "10"))
# Accepted input file extensions (leading dot included).
SUPPORTED_FORMATS = [".docx", ".pdf"]

# ============================================================================
# OUTPUT CONFIGURATION
# ============================================================================

# Directory for generated artifacts; override with the OUTPUT_DIR env var.
OUTPUT_DIR = os.environ.get("OUTPUT_DIR", "./outputs")
# Fixed file names for the CSV and PDF outputs.
CSV_FILENAME = "transcript_analysis.csv"
PDF_FILENAME = "transcript_report.pdf"

# Import-time side effect: make sure the output directory is present
# (no error if it already exists).
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ============================================================================
# DEBUG AND LOGGING
# ============================================================================

# Boolean switches: enabled only when the variable, lower-cased, is exactly
# "true"; any other value (including "1" or "yes") leaves them off.
DEBUG_MODE = os.environ.get("DEBUG_MODE", "False").lower() == "true"
VERBOSE_LOGGING = os.environ.get("VERBOSE_LOGGING", "False").lower() == "true"
# Log file name/path; override with the LOG_FILE env var.
LOG_FILE = os.environ.get("LOG_FILE", "transcript_analysis.log")

# ============================================================================
# ADVANCED SETTINGS
# ============================================================================

# Caching of extracted text so it is not re-processed. On by default; switched
# off by setting ENABLE_CACHING to anything other than "true" (case-insensitive).
ENABLE_CACHING = os.environ.get("ENABLE_CACHING", "True").lower() == "true"
CACHE_DIR = os.environ.get("CACHE_DIR", "./.cache")

# Parallel processing. Off by default; MAX_WORKERS is parsed as an int, so a
# non-numeric override raises ValueError at import time.
ENABLE_PARALLEL_PROCESSING = (
    os.environ.get("ENABLE_PARALLEL_PROCESSING", "False").lower() == "true"
)
MAX_WORKERS = int(os.environ.get("MAX_WORKERS", "4"))

# ============================================================================
# SYSTEM PROMPTS
# ============================================================================

# Shared preamble: role description and core principles. The HCP and patient
# prompts below are this text followed by their own focus section.
BASE_SYSTEM_PROMPT = """You are an expert medical transcript analyzer specializing in healthcare interviews.

Your task is to extract structured, actionable insights from interview transcripts.

Core Principles:
- Focus on factual, verifiable medical information
- Distinguish between speaker roles accurately
- Filter out pleasantries, disclaimers, and off-topic content
- Extract specific medical terms, dosages, and treatment details
- Identify patterns and clinical reasoning
- Maintain objectivity and clinical accuracy
"""

# Prompt for healthcare-professional interviews: base preamble + HCP focus list.
HCP_SYSTEM_PROMPT = f"""{BASE_SYSTEM_PROMPT}
Healthcare Professional Analysis Focus:
- Prescribing patterns and medication choices
- Diagnostic reasoning and clinical decision-making
- Treatment protocols and guidelines referenced
- Peer perspectives on efficacy and safety
- Barriers to treatment or adoption
- Off-label uses or emerging practices

Extract and structure:
1. Diagnoses mentioned with context
2. Prescriptions with dosage, frequency, and rationale
3. Treatment strategies and their justifications
4. Clinical guidelines or studies referenced
5. Challenges or barriers discussed
6. Key clinical insights or pearls
"""

# Prompt for patient interviews: base preamble + patient focus list.
PATIENT_SYSTEM_PROMPT = f"""{BASE_SYSTEM_PROMPT}
Patient Interview Analysis Focus:
- Symptom descriptions and severity
- Treatment experiences and outcomes
- Side effects and tolerability
- Quality of life impacts
- Adherence challenges and enablers
- Emotional and psychological factors
- Healthcare system interactions

Extract and structure:
1. Primary symptoms with duration and severity
2. Current and past treatments
3. Treatment effectiveness and satisfaction
4. Side effects experienced
5. Concerns and unmet needs
6. Quality of life impacts
7. Support systems and resources
"""

# Prompt for the cross-transcript summary; standalone (does not include the
# base preamble).
SUMMARY_SYSTEM_PROMPT = """You are analyzing multiple transcripts to identify cross-cutting trends.

Focus on:
- Frequency analysis (how many interviewees mentioned X?)
- Common patterns and themes
- Consensus and disagreements
- Statistical insights (percentages, distributions)
- Actionable recommendations for stakeholders

Provide:
1. Quantitative summary (X% mentioned Y)
2. Key trends and patterns
3. Notable outliers or unique insights
4. Actionable recommendations
5. Data gaps or areas needing follow-up
"""

# ============================================================================
# VALIDATION SETTINGS
# ============================================================================

# Knobs for text validation. The checks that read these values live outside
# this file, so the exact meaning of each ratio should be confirmed there.
VALIDATION_CONFIG = dict(
    min_word_ratio=0.3,
    max_repetition_ratio=1.5,
    min_sentences=3,
    check_errors=True,
    check_gibberish=True,
)

# ============================================================================
# DASHBOARD SETTINGS
# ============================================================================

# Presentation settings for the dashboard. figure_size/dpi/style look like
# plotting-library parameters - confirm against the dashboard code, which is
# not in this file. Colours are hex RGB strings.
DASHBOARD_CONFIG = dict(
    figure_size=(14, 10),
    dpi=100,
    style="default",
    top_n_items=8,
    color_scheme=dict(
        primary="#3498db",
        secondary="#2ecc71",
        accent="#e74c3c",
        warning="#f39c12",
    ),
)

# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

def get_config() -> Dict[str, Any]:
    """Return a snapshot of the module-level settings as a nested dictionary.

    The top-level keys "llm", "chunking", "quality", "files" and "output" map
    to dictionaries of related settings; "debug", "caching" and "parallel"
    map directly to booleans.

    The "model" entry under "llm" is the model configured for the active
    LLM_BACKEND: HF_MODEL for "hf_api", OPENAI_MODEL for "openai", and
    LOCAL_MODEL for "local" or any unrecognised backend.

    Returns:
        A newly built dictionary on every call. Secrets (HUGGINGFACE_TOKEN,
        OPENAI_API_KEY) are not included.
    """
    # Previously every non-"hf_api" backend reported LOCAL_MODEL, so the
    # "openai" backend showed the wrong model name.
    model_by_backend = {"hf_api": HF_MODEL, "openai": OPENAI_MODEL}
    active_model = model_by_backend.get(LLM_BACKEND, LOCAL_MODEL)

    return {
        "llm": {
            "backend": LLM_BACKEND,
            "model": active_model,
            "max_tokens": MAX_TOKENS_PER_REQUEST,
            "temperature": LLM_TEMPERATURE,
            "timeout": LLM_TIMEOUT,
        },
        "chunking": {
            "max_tokens": MAX_CHUNK_TOKENS,
            "overlap": OVERLAP_TOKENS,
        },
        "quality": {
            "min_score": MIN_QUALITY_SCORE,
            "min_words": MIN_WORD_COUNT,
        },
        "files": {
            "max_size_mb": MAX_FILE_SIZE_MB,
            "max_per_batch": MAX_FILES_PER_BATCH,
            # Copy so callers editing the snapshot cannot alter the
            # module-level SUPPORTED_FORMATS list.
            "supported": list(SUPPORTED_FORMATS),
        },
        "output": {
            "directory": OUTPUT_DIR,
            "csv": CSV_FILENAME,
            "pdf": PDF_FILENAME,
        },
        "debug": DEBUG_MODE,
        "caching": ENABLE_CACHING,
        "parallel": ENABLE_PARALLEL_PROCESSING,
    }


def print_config():
    """Print the current configuration to stdout, grouped by section.

    Reads the settings from get_config(). Entries whose value is a dictionary
    are printed as an upper-cased heading followed by one indented
    "key: value" line per setting. Entries whose value is not a dictionary
    (get_config() returns plain booleans for "debug", "caching" and
    "parallel") are printed on a single "HEADING: value" line.
    """
    config = get_config()
    rule = "=" * 60

    print(rule)
    print("TRANSCRIPTORAI CONFIGURATION")
    print(rule)
    for section, settings in config.items():
        if isinstance(settings, dict):
            print(f"\n{section.upper()}:")
            for key, value in settings.items():
                print(f"  {key}: {value}")
        else:
            # Scalar entries have no .items(); calling it on them raised
            # AttributeError before the closing rule was printed.
            print(f"\n{section.upper()}: {settings}")
    print(rule)


def validate_config() -> bool:
    """Check the module-level settings and report any problems on stdout.

    Checks performed:
      * the credential for the selected LLM_BACKEND is set
        (HUGGINGFACE_TOKEN for "hf_api", OPENAI_API_KEY for "openai");
      * OUTPUT_DIR exists as a directory or can be created;
      * MAX_CHUNK_TOKENS is at least 500;
      * MAX_TOKENS_PER_REQUEST is at least 100.

    Side effects:
        Creates OUTPUT_DIR if it is missing, and prints one line per issue
        found.

    Returns:
        True when no issue was found, False otherwise.
    """
    issues = []

    # Check LLM configuration
    if LLM_BACKEND == "hf_api" and not HUGGINGFACE_TOKEN:
        issues.append("HF API selected but HUGGINGFACE_TOKEN not set")

    if LLM_BACKEND == "openai" and not OPENAI_API_KEY:
        issues.append("OpenAI selected but OPENAI_API_KEY not set")

    # Create the directory directly instead of testing for it first: this
    # avoids a race between the check and the creation, and also reports
    # the case where OUTPUT_DIR exists but is a regular file.
    try:
        os.makedirs(OUTPUT_DIR, exist_ok=True)
    except OSError:
        # Only filesystem errors are turned into an issue; a bare except here
        # would also swallow KeyboardInterrupt and SystemExit.
        issues.append(f"Cannot create output directory: {OUTPUT_DIR}")

    # Check reasonable values
    if MAX_CHUNK_TOKENS < 500:
        issues.append("MAX_CHUNK_TOKENS too small (< 500)")

    if MAX_TOKENS_PER_REQUEST < 100:
        issues.append("MAX_TOKENS_PER_REQUEST too small (< 100)")

    if issues:
        print("Configuration Issues:")
        for issue in issues:
            print(f"  - {issue}")
        return False

    return True


# ============================================================================
# INITIALIZATION
# ============================================================================

if __name__ == "__main__":
    # Run as a script: dump the settings first, then validate them and print
    # a one-line verdict (validate_config() itself lists any issues found).
    print_config()
    verdict = (
        "\n✓ Configuration valid"
        if validate_config()
        else "\n✗ Configuration has issues"
    )
    print(verdict)