jmisak committed on
Commit
a233900
·
verified ·
1 Parent(s): faacab1

Update config.py

Browse files
Files changed (1) hide show
  1. config.py +282 -282
config.py CHANGED
@@ -1,283 +1,283 @@
1
- import os
2
- from typing import Dict, Any
3
-
4
- # ============================================================================
5
- # LLM CONFIGURATION
6
- # ============================================================================
7
-
8
- # Choose LLM backend: "hf_api" (recommended), "local", or "openai"
9
- LLM_BACKEND = os.getenv("LLM_BACKEND", "hf_api")
10
-
11
- # Hugging Face Configuration
12
- HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN", "")
13
- HF_MODEL = os.getenv("HF_MODEL", "mistralai/Mixtral-8x7B-Instruct-v0.1")
14
-
15
- # Local Model Configuration
16
- LOCAL_MODEL = os.getenv("LOCAL_MODEL", "google/flan-t5-xl")
17
- DEVICE = os.getenv("DEVICE", "auto") # "auto", "cpu", "cuda", "mps"
18
-
19
- # OpenAI Configuration (if using OpenAI)
20
- OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
21
- OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-4")
22
-
23
- # LLM Parameters
24
- MAX_TOKENS_PER_REQUEST = int(os.getenv("MAX_TOKENS_PER_REQUEST", "300"))
25
- LLM_TEMPERATURE = float(os.getenv("LLM_TEMPERATURE", "0.3"))
26
- LLM_TIMEOUT = int(os.getenv("LLM_TIMEOUT", "120"))
27
-
28
- # ============================================================================
29
- # CHUNKING CONFIGURATION
30
- # ============================================================================
31
-
32
- MAX_CHUNK_TOKENS = int(os.getenv("MAX_CHUNK_TOKENS", "6000"))
33
- OVERLAP_TOKENS = int(os.getenv("OVERLAP_TOKENS", "150"))
34
- TOKENIZER_ENCODING = os.getenv("TOKENIZER_ENCODING", "cl100k_base")
35
-
36
- # ============================================================================
37
- # QUALITY THRESHOLDS
38
- # ============================================================================
39
-
40
- MIN_QUALITY_SCORE = float(os.getenv("MIN_QUALITY_SCORE", "0.3"))
41
- MIN_WORD_COUNT = int(os.getenv("MIN_WORD_COUNT", "50"))
42
- MIN_TEXT_LENGTH = int(os.getenv("MIN_TEXT_LENGTH", "100"))
43
-
44
- # Quality grade thresholds
45
- QUALITY_EXCELLENT = 0.8
46
- QUALITY_GOOD = 0.6
47
- QUALITY_FAIR = 0.4
48
-
49
- # ============================================================================
50
- # FILE PROCESSING CONFIGURATION
51
- # ============================================================================
52
-
53
- MAX_FILE_SIZE_MB = int(os.getenv("MAX_FILE_SIZE_MB", "50"))
54
- SUPPORTED_FORMATS = [".docx", ".pdf"]
55
- MAX_FILES_PER_BATCH = int(os.getenv("MAX_FILES_PER_BATCH", "10"))
56
-
57
- # ============================================================================
58
- # OUTPUT CONFIGURATION
59
- # ============================================================================
60
-
61
- OUTPUT_DIR = os.getenv("OUTPUT_DIR", "./outputs")
62
- CSV_FILENAME = "transcript_analysis.csv"
63
- PDF_FILENAME = "transcript_report.pdf"
64
-
65
- # Ensure output directory exists
66
- os.makedirs(OUTPUT_DIR, exist_ok=True)
67
-
68
- # ============================================================================
69
- # DEBUG AND LOGGING
70
- # ============================================================================
71
-
72
- DEBUG_MODE = os.getenv("DEBUG_MODE", "False").lower() == "true"
73
- VERBOSE_LOGGING = os.getenv("VERBOSE_LOGGING", "False").lower() == "true"
74
- LOG_FILE = os.getenv("LOG_FILE", "transcript_analysis.log")
75
-
76
- # ============================================================================
77
- # ADVANCED SETTINGS
78
- # ============================================================================
79
-
80
- # Cache extracted text to avoid re-processing
81
- ENABLE_CACHING = os.getenv("ENABLE_CACHING", "True").lower() == "true"
82
- CACHE_DIR = os.getenv("CACHE_DIR", "./.cache")
83
-
84
- # Parallel processing
85
- ENABLE_PARALLEL_PROCESSING = os.getenv("ENABLE_PARALLEL_PROCESSING", "False").lower() == "true"
86
- MAX_WORKERS = int(os.getenv("MAX_WORKERS", "4"))
87
-
88
- # ============================================================================
89
- # SYSTEM PROMPTS
90
- # ============================================================================
91
-
92
- BASE_SYSTEM_PROMPT = """You are an expert medical transcript analyzer specializing in healthcare interviews.
93
-
94
- Your task is to extract structured, actionable insights from interview transcripts.
95
-
96
- Core Principles:
97
- - Focus on factual, verifiable medical information
98
- - Distinguish between speaker roles accurately
99
- - Filter out pleasantries, disclaimers, and off-topic content
100
- - Extract specific medical terms, dosages, and treatment details
101
- - Identify patterns and clinical reasoning
102
- - Maintain objectivity and clinical accuracy
103
- """
104
-
105
- HCP_SYSTEM_PROMPT = BASE_SYSTEM_PROMPT + """
106
- Healthcare Professional Analysis Focus:
107
- - Prescribing patterns and medication choices
108
- - Diagnostic reasoning and clinical decision-making
109
- - Treatment protocols and guidelines referenced
110
- - Peer perspectives on efficacy and safety
111
- - Barriers to treatment or adoption
112
- - Off-label uses or emerging practices
113
-
114
- Extract and structure:
115
- 1. Diagnoses mentioned with context
116
- 2. Prescriptions with dosage, frequency, and rationale
117
- 3. Treatment strategies and their justifications
118
- 4. Clinical guidelines or studies referenced
119
- 5. Challenges or barriers discussed
120
- 6. Key clinical insights or pearls
121
- """
122
-
123
- PATIENT_SYSTEM_PROMPT = BASE_SYSTEM_PROMPT + """
124
- Patient Interview Analysis Focus:
125
- - Symptom descriptions and severity
126
- - Treatment experiences and outcomes
127
- - Side effects and tolerability
128
- - Quality of life impacts
129
- - Adherence challenges and enablers
130
- - Emotional and psychological factors
131
- - Healthcare system interactions
132
-
133
- Extract and structure:
134
- 1. Primary symptoms with duration and severity
135
- 2. Current and past treatments
136
- 3. Treatment effectiveness and satisfaction
137
- 4. Side effects experienced
138
- 5. Concerns and unmet needs
139
- 6. Quality of life impacts
140
- 7. Support systems and resources
141
- """
142
-
143
- SUMMARY_SYSTEM_PROMPT = """You are analyzing multiple transcripts to identify cross-cutting trends.
144
-
145
- Focus on:
146
- - Frequency analysis (how many interviewees mentioned X?)
147
- - Common patterns and themes
148
- - Consensus and disagreements
149
- - Statistical insights (percentages, distributions)
150
- - Actionable recommendations for stakeholders
151
-
152
- Provide:
153
- 1. Quantitative summary (X% mentioned Y)
154
- 2. Key trends and patterns
155
- 3. Notable outliers or unique insights
156
- 4. Actionable recommendations
157
- 5. Data gaps or areas needing follow-up
158
- """
159
-
160
- # ============================================================================
161
- # VALIDATION SETTINGS
162
- # ============================================================================
163
-
164
- VALIDATION_CONFIG = {
165
- "min_word_ratio": 0.3,
166
- "max_repetition_ratio": 1.5,
167
- "min_sentences": 3,
168
- "check_errors": True,
169
- "check_gibberish": True
170
- }
171
-
172
- # ============================================================================
173
- # DASHBOARD SETTINGS
174
- # ============================================================================
175
-
176
- DASHBOARD_CONFIG = {
177
- "figure_size": (14, 10),
178
- "dpi": 100,
179
- "style": "default",
180
- "top_n_items": 8,
181
- "color_scheme": {
182
- "primary": "#3498db",
183
- "secondary": "#2ecc71",
184
- "accent": "#e74c3c",
185
- "warning": "#f39c12"
186
- }
187
- }
188
-
189
- # ============================================================================
190
- # HELPER FUNCTIONS
191
- # ============================================================================
192
-
193
- def get_config() -> Dict[str, Any]:
194
- """Return all configuration as a dictionary"""
195
- return {
196
- "llm": {
197
- "backend": LLM_BACKEND,
198
- "model": HF_MODEL if LLM_BACKEND == "hf_api" else LOCAL_MODEL,
199
- "max_tokens": MAX_TOKENS_PER_REQUEST,
200
- "temperature": LLM_TEMPERATURE,
201
- "timeout": LLM_TIMEOUT
202
- },
203
- "chunking": {
204
- "max_tokens": MAX_CHUNK_TOKENS,
205
- "overlap": OVERLAP_TOKENS
206
- },
207
- "quality": {
208
- "min_score": MIN_QUALITY_SCORE,
209
- "min_words": MIN_WORD_COUNT
210
- },
211
- "files": {
212
- "max_size_mb": MAX_FILE_SIZE_MB,
213
- "max_per_batch": MAX_FILES_PER_BATCH,
214
- "supported": SUPPORTED_FORMATS
215
- },
216
- "output": {
217
- "directory": OUTPUT_DIR,
218
- "csv": CSV_FILENAME,
219
- "pdf": PDF_FILENAME
220
- },
221
- "debug": DEBUG_MODE,
222
- "caching": ENABLE_CACHING,
223
- "parallel": ENABLE_PARALLEL_PROCESSING
224
- }
225
-
226
-
227
- def print_config():
228
- """Print current configuration"""
229
- config = get_config()
230
- print("=" * 60)
231
- print("TRANSCRIPTORAI CONFIGURATION")
232
- print("=" * 60)
233
- for section, settings in config.items():
234
- print(f"\n{section.upper()}:")
235
- for key, value in settings.items():
236
- print(f" {key}: {value}")
237
- print("=" * 60)
238
-
239
-
240
- def validate_config() -> bool:
241
- """Validate configuration settings"""
242
- issues = []
243
-
244
- # Check LLM configuration
245
- if LLM_BACKEND == "hf_api" and not HUGGINGFACE_TOKEN:
246
- issues.append("HF API selected but HUGGINGFACE_TOKEN not set")
247
-
248
- if LLM_BACKEND == "openai" and not OPENAI_API_KEY:
249
- issues.append("OpenAI selected but OPENAI_API_KEY not set")
250
-
251
- # Check paths exist
252
- if not os.path.exists(OUTPUT_DIR):
253
- try:
254
- os.makedirs(OUTPUT_DIR)
255
- except:
256
- issues.append(f"Cannot create output directory: {OUTPUT_DIR}")
257
-
258
- # Check reasonable values
259
- if MAX_CHUNK_TOKENS < 500:
260
- issues.append("MAX_CHUNK_TOKENS too small (< 500)")
261
-
262
- if MAX_TOKENS_PER_REQUEST < 100:
263
- issues.append("MAX_TOKENS_PER_REQUEST too small (< 100)")
264
-
265
- if issues:
266
- print("Configuration Issues:")
267
- for issue in issues:
268
- print(f" - {issue}")
269
- return False
270
-
271
- return True
272
-
273
-
274
- # ============================================================================
275
- # INITIALIZATION
276
- # ============================================================================
277
-
278
- if __name__ == "__main__":
279
- print_config()
280
- if validate_config():
281
- print("\n✓ Configuration valid")
282
- else:
283
  print("\n✗ Configuration has issues")
 
1
"""Central configuration for the transcript analyzer, tuned for HF Spaces.

All tunables default to Spaces-friendly values but remain overridable via
environment variables, so other deployments don't need code edits.
"""
import os
from typing import Dict, Any

# ============================================================================
# LLM CONFIGURATION
# ============================================================================

# Choose LLM backend: "hf_api" (recommended), "local", or "openai".
# "hf_api" is the default for HF Spaces; still env-overridable.
LLM_BACKEND = os.getenv("LLM_BACKEND", "hf_api")

# Hugging Face Configuration
HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN", "")
# Lighter default model for Spaces; override via HF_MODEL if needed.
HF_MODEL = os.getenv("HF_MODEL", "mistralai/Mistral-7B-Instruct-v0.2")

# Local Model Configuration
LOCAL_MODEL = os.getenv("LOCAL_MODEL", "google/flan-t5-xl")
DEVICE = os.getenv("DEVICE", "auto")  # "auto", "cpu", "cuda", "mps"

# OpenAI Configuration (if using OpenAI)
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-4")

# LLM Parameters
# Defaults lowered for Spaces (short responses, 25s proxy timeout), but kept
# env-overridable instead of hard-coded so other environments can raise them.
MAX_TOKENS_PER_REQUEST = int(os.getenv("MAX_TOKENS_PER_REQUEST", "100"))
LLM_TEMPERATURE = float(os.getenv("LLM_TEMPERATURE", "0.3"))
LLM_TIMEOUT = int(os.getenv("LLM_TIMEOUT", "25"))

# ============================================================================
# CHUNKING CONFIGURATION
# ============================================================================

# Smaller chunks by default for Spaces; env-overridable.
MAX_CHUNK_TOKENS = int(os.getenv("MAX_CHUNK_TOKENS", "2000"))
OVERLAP_TOKENS = int(os.getenv("OVERLAP_TOKENS", "150"))
TOKENIZER_ENCODING = os.getenv("TOKENIZER_ENCODING", "cl100k_base")

# ============================================================================
# QUALITY THRESHOLDS
# ============================================================================

MIN_QUALITY_SCORE = float(os.getenv("MIN_QUALITY_SCORE", "0.3"))
MIN_WORD_COUNT = int(os.getenv("MIN_WORD_COUNT", "50"))
MIN_TEXT_LENGTH = int(os.getenv("MIN_TEXT_LENGTH", "100"))

# Quality grade thresholds (score >= threshold maps to that grade;
# presumably consumed by the scoring module — confirm there).
QUALITY_EXCELLENT = 0.8
QUALITY_GOOD = 0.6
QUALITY_FAIR = 0.4

# ============================================================================
# FILE PROCESSING CONFIGURATION
# ============================================================================

MAX_FILE_SIZE_MB = int(os.getenv("MAX_FILE_SIZE_MB", "50"))
SUPPORTED_FORMATS = [".docx", ".pdf"]
MAX_FILES_PER_BATCH = int(os.getenv("MAX_FILES_PER_BATCH", "10"))

# ============================================================================
# OUTPUT CONFIGURATION
# ============================================================================

OUTPUT_DIR = os.getenv("OUTPUT_DIR", "./outputs")
CSV_FILENAME = "transcript_analysis.csv"
PDF_FILENAME = "transcript_report.pdf"

# Ensure output directory exists. Import-time side effect kept for backward
# compatibility: other modules may rely on the directory existing.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ============================================================================
# DEBUG AND LOGGING
# ============================================================================

# String-to-bool: only the literal (case-insensitive) "true" enables a flag.
DEBUG_MODE = os.getenv("DEBUG_MODE", "False").lower() == "true"
VERBOSE_LOGGING = os.getenv("VERBOSE_LOGGING", "False").lower() == "true"
LOG_FILE = os.getenv("LOG_FILE", "transcript_analysis.log")

# ============================================================================
# ADVANCED SETTINGS
# ============================================================================

# Cache extracted text to avoid re-processing
ENABLE_CACHING = os.getenv("ENABLE_CACHING", "True").lower() == "true"
CACHE_DIR = os.getenv("CACHE_DIR", "./.cache")

# Parallel processing
ENABLE_PARALLEL_PROCESSING = os.getenv("ENABLE_PARALLEL_PROCESSING", "False").lower() == "true"
MAX_WORKERS = int(os.getenv("MAX_WORKERS", "4"))
88
# ============================================================================
# SYSTEM PROMPTS
# ============================================================================

# Shared preamble for every analysis prompt. The role-specific prompts below
# concatenate their own focus areas onto this base.
BASE_SYSTEM_PROMPT = """You are an expert medical transcript analyzer specializing in healthcare interviews.

Your task is to extract structured, actionable insights from interview transcripts.

Core Principles:
- Focus on factual, verifiable medical information
- Distinguish between speaker roles accurately
- Filter out pleasantries, disclaimers, and off-topic content
- Extract specific medical terms, dosages, and treatment details
- Identify patterns and clinical reasoning
- Maintain objectivity and clinical accuracy
"""

# Prompt used when the interviewee is a healthcare professional (HCP).
HCP_SYSTEM_PROMPT = BASE_SYSTEM_PROMPT + """
Healthcare Professional Analysis Focus:
- Prescribing patterns and medication choices
- Diagnostic reasoning and clinical decision-making
- Treatment protocols and guidelines referenced
- Peer perspectives on efficacy and safety
- Barriers to treatment or adoption
- Off-label uses or emerging practices

Extract and structure:
1. Diagnoses mentioned with context
2. Prescriptions with dosage, frequency, and rationale
3. Treatment strategies and their justifications
4. Clinical guidelines or studies referenced
5. Challenges or barriers discussed
6. Key clinical insights or pearls
"""

# Prompt used when the interviewee is a patient.
PATIENT_SYSTEM_PROMPT = BASE_SYSTEM_PROMPT + """
Patient Interview Analysis Focus:
- Symptom descriptions and severity
- Treatment experiences and outcomes
- Side effects and tolerability
- Quality of life impacts
- Adherence challenges and enablers
- Emotional and psychological factors
- Healthcare system interactions

Extract and structure:
1. Primary symptoms with duration and severity
2. Current and past treatments
3. Treatment effectiveness and satisfaction
4. Side effects experienced
5. Concerns and unmet needs
6. Quality of life impacts
7. Support systems and resources
"""

# Prompt for cross-transcript trend analysis (does NOT include the base
# preamble — it is a standalone prompt).
SUMMARY_SYSTEM_PROMPT = """You are analyzing multiple transcripts to identify cross-cutting trends.

Focus on:
- Frequency analysis (how many interviewees mentioned X?)
- Common patterns and themes
- Consensus and disagreements
- Statistical insights (percentages, distributions)
- Actionable recommendations for stakeholders

Provide:
1. Quantitative summary (X% mentioned Y)
2. Key trends and patterns
3. Notable outliers or unique insights
4. Actionable recommendations
5. Data gaps or areas needing follow-up
"""
159
+
160
# ============================================================================
# VALIDATION SETTINGS
# ============================================================================

# Thresholds for sanity-checking text quality; exact semantics live in the
# consumer module (not visible here) — verify there before tuning.
VALIDATION_CONFIG = {
    "min_word_ratio": 0.3,
    "max_repetition_ratio": 1.5,
    "min_sentences": 3,
    "check_errors": True,
    "check_gibberish": True
}

# ============================================================================
# DASHBOARD SETTINGS
# ============================================================================

# Plot sizing and palette for the results dashboard; presumably consumed by
# a matplotlib-based renderer — confirm against the dashboard module.
DASHBOARD_CONFIG = {
    "figure_size": (14, 10),
    "dpi": 100,
    "style": "default",
    "top_n_items": 8,
    "color_scheme": {
        "primary": "#3498db",
        "secondary": "#2ecc71",
        "accent": "#e74c3c",
        "warning": "#f39c12"
    }
}
188
+
189
# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

def get_config() -> Dict[str, Any]:
    """Assemble the module's configuration constants into one dictionary.

    Nested sections ("llm", "chunking", "quality", "files", "output") are
    dicts; "debug", "caching", and "parallel" are top-level scalar flags.
    """
    llm_section = {
        "backend": LLM_BACKEND,
        # The effective model depends on which backend is selected.
        "model": HF_MODEL if LLM_BACKEND == "hf_api" else LOCAL_MODEL,
        "max_tokens": MAX_TOKENS_PER_REQUEST,
        "temperature": LLM_TEMPERATURE,
        "timeout": LLM_TIMEOUT,
    }
    chunking_section = {
        "max_tokens": MAX_CHUNK_TOKENS,
        "overlap": OVERLAP_TOKENS,
    }
    quality_section = {
        "min_score": MIN_QUALITY_SCORE,
        "min_words": MIN_WORD_COUNT,
    }
    files_section = {
        "max_size_mb": MAX_FILE_SIZE_MB,
        "max_per_batch": MAX_FILES_PER_BATCH,
        "supported": SUPPORTED_FORMATS,
    }
    output_section = {
        "directory": OUTPUT_DIR,
        "csv": CSV_FILENAME,
        "pdf": PDF_FILENAME,
    }
    return {
        "llm": llm_section,
        "chunking": chunking_section,
        "quality": quality_section,
        "files": files_section,
        "output": output_section,
        "debug": DEBUG_MODE,
        "caching": ENABLE_CACHING,
        "parallel": ENABLE_PARALLEL_PROCESSING,
    }
225
+
226
+
227
def print_config():
    """Print the current configuration to stdout, section by section.

    Fix: get_config() returns dicts for most sections but plain booleans for
    "debug", "caching", and "parallel"; the previous version unconditionally
    called .items() on every value and raised AttributeError on those flags.
    """
    config = get_config()
    print("=" * 60)
    print("TRANSCRIPTORAI CONFIGURATION")
    print("=" * 60)
    for section, settings in config.items():
        print(f"\n{section.upper()}:")
        if isinstance(settings, dict):
            for key, value in settings.items():
                print(f" {key}: {value}")
        else:
            # Scalar top-level flag (debug / caching / parallel).
            print(f" {settings}")
    print("=" * 60)
238
+
239
+
240
def validate_config() -> bool:
    """Validate configuration settings.

    Prints every problem found and returns True when the configuration is
    usable, False otherwise. Never raises for expected failure modes.
    """
    issues = []

    # Each backend needs its credential present.
    if LLM_BACKEND == "hf_api" and not HUGGINGFACE_TOKEN:
        issues.append("HF API selected but HUGGINGFACE_TOKEN not set")

    if LLM_BACKEND == "openai" and not OPENAI_API_KEY:
        issues.append("OpenAI selected but OPENAI_API_KEY not set")

    # Re-check the output directory (module import already attempts creation,
    # but this reports a usable diagnostic if that failed).
    if not os.path.exists(OUTPUT_DIR):
        try:
            os.makedirs(OUTPUT_DIR, exist_ok=True)
        except OSError:
            # Narrowed from a bare `except:`, which also swallowed
            # KeyboardInterrupt and SystemExit.
            issues.append(f"Cannot create output directory: {OUTPUT_DIR}")

    # Sanity-check numeric limits.
    if MAX_CHUNK_TOKENS < 500:
        issues.append("MAX_CHUNK_TOKENS too small (< 500)")

    if MAX_TOKENS_PER_REQUEST < 100:
        issues.append("MAX_TOKENS_PER_REQUEST too small (< 100)")

    if issues:
        print("Configuration Issues:")
        for issue in issues:
            print(f" - {issue}")
        return False

    return True
272
+
273
+
274
# ============================================================================
# INITIALIZATION
# ============================================================================

if __name__ == "__main__":
    # Dump the effective settings, then report whether they are usable.
    print_config()
    ok = validate_config()
    print("\n✓ Configuration valid" if ok else "\n✗ Configuration has issues")