# NOTE: removed non-Python residue ("Spaces: Sleeping" status banner) that was
# scraped in from the Hugging Face Spaces page hosting this file.
import os
from typing import Dict, Any

# ============================================================================
# LLM CONFIGURATION
# ============================================================================
# Choose LLM backend: "hf_api" (recommended), "local", or "openai"
LLM_BACKEND = "hf_api"  # Forced for HF Spaces

# Hugging Face Configuration
HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN", "")  # empty -> flagged by validate_config()
HF_MODEL = "mistralai/Mistral-7B-Instruct-v0.2"  # Lighter for Spaces

# Local Model Configuration
LOCAL_MODEL = os.getenv("LOCAL_MODEL", "google/flan-t5-xl")
DEVICE = os.getenv("DEVICE", "auto")  # "auto", "cpu", "cuda", "mps"

# OpenAI Configuration (if using OpenAI)
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-4")

# LLM Parameters
MAX_TOKENS_PER_REQUEST = 100  # Faster for Spaces
LLM_TEMPERATURE = float(os.getenv("LLM_TEMPERATURE", "0.3"))
LLM_TIMEOUT = 25  # Spaces timeout limit

# ============================================================================
# CHUNKING CONFIGURATION
# ============================================================================
MAX_CHUNK_TOKENS = 2000  # Lighter for Spaces
OVERLAP_TOKENS = int(os.getenv("OVERLAP_TOKENS", "150"))
# presumably a tiktoken encoding name — TODO confirm against the chunker
TOKENIZER_ENCODING = os.getenv("TOKENIZER_ENCODING", "cl100k_base")

# ============================================================================
# QUALITY THRESHOLDS
# ============================================================================
MIN_QUALITY_SCORE = float(os.getenv("MIN_QUALITY_SCORE", "0.3"))
MIN_WORD_COUNT = int(os.getenv("MIN_WORD_COUNT", "50"))
MIN_TEXT_LENGTH = int(os.getenv("MIN_TEXT_LENGTH", "100"))

# Quality grade thresholds (presumably same 0-1 scale as MIN_QUALITY_SCORE)
QUALITY_EXCELLENT = 0.8
QUALITY_GOOD = 0.6
QUALITY_FAIR = 0.4

# ============================================================================
# FILE PROCESSING CONFIGURATION
# ============================================================================
MAX_FILE_SIZE_MB = int(os.getenv("MAX_FILE_SIZE_MB", "50"))
SUPPORTED_FORMATS = [".docx", ".pdf"]
MAX_FILES_PER_BATCH = int(os.getenv("MAX_FILES_PER_BATCH", "10"))

# ============================================================================
# OUTPUT CONFIGURATION
# ============================================================================
OUTPUT_DIR = os.getenv("OUTPUT_DIR", "./outputs")
CSV_FILENAME = "transcript_analysis.csv"
PDF_FILENAME = "transcript_report.pdf"

# Ensure output directory exists (import-time side effect)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ============================================================================
# DEBUG AND LOGGING
# ============================================================================
# Env flags parse case-insensitively; anything other than "true" is False.
DEBUG_MODE = os.getenv("DEBUG_MODE", "False").lower() == "true"
VERBOSE_LOGGING = os.getenv("VERBOSE_LOGGING", "False").lower() == "true"
LOG_FILE = os.getenv("LOG_FILE", "transcript_analysis.log")

# ============================================================================
# ADVANCED SETTINGS
# ============================================================================
# Cache extracted text to avoid re-processing
ENABLE_CACHING = os.getenv("ENABLE_CACHING", "True").lower() == "true"
CACHE_DIR = os.getenv("CACHE_DIR", "./.cache")

# Parallel processing
ENABLE_PARALLEL_PROCESSING = os.getenv("ENABLE_PARALLEL_PROCESSING", "False").lower() == "true"
MAX_WORKERS = int(os.getenv("MAX_WORKERS", "4"))
# ============================================================================
# SYSTEM PROMPTS
# ============================================================================
# Shared preamble prepended to both role-specific prompts below.
BASE_SYSTEM_PROMPT = """You are an expert medical transcript analyzer specializing in healthcare interviews.
Your task is to extract structured, actionable insights from interview transcripts.
Core Principles:
- Focus on factual, verifiable medical information
- Distinguish between speaker roles accurately
- Filter out pleasantries, disclaimers, and off-topic content
- Extract specific medical terms, dosages, and treatment details
- Identify patterns and clinical reasoning
- Maintain objectivity and clinical accuracy
"""

# Prompt for transcripts of healthcare-professional (HCP) interviews.
HCP_SYSTEM_PROMPT = BASE_SYSTEM_PROMPT + """
Healthcare Professional Analysis Focus:
- Prescribing patterns and medication choices
- Diagnostic reasoning and clinical decision-making
- Treatment protocols and guidelines referenced
- Peer perspectives on efficacy and safety
- Barriers to treatment or adoption
- Off-label uses or emerging practices
Extract and structure:
1. Diagnoses mentioned with context
2. Prescriptions with dosage, frequency, and rationale
3. Treatment strategies and their justifications
4. Clinical guidelines or studies referenced
5. Challenges or barriers discussed
6. Key clinical insights or pearls
"""

# Prompt for transcripts of patient interviews.
PATIENT_SYSTEM_PROMPT = BASE_SYSTEM_PROMPT + """
Patient Interview Analysis Focus:
- Symptom descriptions and severity
- Treatment experiences and outcomes
- Side effects and tolerability
- Quality of life impacts
- Adherence challenges and enablers
- Emotional and psychological factors
- Healthcare system interactions
Extract and structure:
1. Primary symptoms with duration and severity
2. Current and past treatments
3. Treatment effectiveness and satisfaction
4. Side effects experienced
5. Concerns and unmet needs
6. Quality of life impacts
7. Support systems and resources
"""

# Prompt for the cross-transcript aggregation/summary pass (standalone; does
# not include BASE_SYSTEM_PROMPT).
SUMMARY_SYSTEM_PROMPT = """You are analyzing multiple transcripts to identify cross-cutting trends.
Focus on:
- Frequency analysis (how many interviewees mentioned X?)
- Common patterns and themes
- Consensus and disagreements
- Statistical insights (percentages, distributions)
- Actionable recommendations for stakeholders
Provide:
1. Quantitative summary (X% mentioned Y)
2. Key trends and patterns
3. Notable outliers or unique insights
4. Actionable recommendations
5. Data gaps or areas needing follow-up
"""
# ============================================================================
# VALIDATION SETTINGS
# ============================================================================
# Knobs consumed by the (external) transcript-validation step.
VALIDATION_CONFIG = {
    "min_word_ratio": 0.3,        # presumably min fraction of dictionary words — TODO confirm with validator
    "max_repetition_ratio": 1.5,  # NOTE(review): a "ratio" > 1.0 — verify the units in the consumer
    "min_sentences": 3,
    "check_errors": True,
    "check_gibberish": True
}
# ============================================================================
# DASHBOARD SETTINGS
# ============================================================================
# Plot/rendering settings; figure_size and dpi follow matplotlib conventions
# (inches, dots-per-inch) — consumer not visible here, confirm.
DASHBOARD_CONFIG = {
    "figure_size": (14, 10),
    "dpi": 100,
    "style": "default",
    "top_n_items": 8,              # how many top-ranked items to chart
    "color_scheme": {              # hex RGB colors
        "primary": "#3498db",
        "secondary": "#2ecc71",
        "accent": "#e74c3c",
        "warning": "#f39c12"
    }
}
# ============================================================================
# HELPER FUNCTIONS
# ============================================================================
def get_config() -> Dict[str, Any]:
    """Snapshot the module-level configuration into a nested dictionary.

    Top-level keys "llm", "chunking", "quality", "files" and "output" map
    to sub-dictionaries; "debug", "caching" and "parallel" map to plain
    booleans. The LLM "model" entry reflects the active backend.
    """
    active_model = HF_MODEL if LLM_BACKEND == "hf_api" else LOCAL_MODEL
    llm_section = {
        "backend": LLM_BACKEND,
        "model": active_model,
        "max_tokens": MAX_TOKENS_PER_REQUEST,
        "temperature": LLM_TEMPERATURE,
        "timeout": LLM_TIMEOUT,
    }
    chunking_section = {
        "max_tokens": MAX_CHUNK_TOKENS,
        "overlap": OVERLAP_TOKENS,
    }
    quality_section = {
        "min_score": MIN_QUALITY_SCORE,
        "min_words": MIN_WORD_COUNT,
    }
    files_section = {
        "max_size_mb": MAX_FILE_SIZE_MB,
        "max_per_batch": MAX_FILES_PER_BATCH,
        "supported": SUPPORTED_FORMATS,
    }
    output_section = {
        "directory": OUTPUT_DIR,
        "csv": CSV_FILENAME,
        "pdf": PDF_FILENAME,
    }
    return {
        "llm": llm_section,
        "chunking": chunking_section,
        "quality": quality_section,
        "files": files_section,
        "output": output_section,
        "debug": DEBUG_MODE,
        "caching": ENABLE_CACHING,
        "parallel": ENABLE_PARALLEL_PROCESSING,
    }
def print_config():
    """Print the current configuration to stdout, one section at a time.

    Handles both kinds of top-level entries produced by get_config():
    dict sections are printed key-by-key, while scalar entries (the
    "debug"/"caching"/"parallel" booleans) are printed directly.
    Previously this called .items() unconditionally and crashed with
    AttributeError on the boolean entries.
    """
    config = get_config()
    print("=" * 60)
    print("TRANSCRIPTORAI CONFIGURATION")
    print("=" * 60)
    for section, settings in config.items():
        print(f"\n{section.upper()}:")
        if isinstance(settings, dict):
            for key, value in settings.items():
                print(f"  {key}: {value}")
        else:
            print(f"  {settings}")
    print("=" * 60)
def validate_config() -> bool:
    """Validate configuration settings.

    Checks that the selected LLM backend has its credential set, that the
    output directory exists (creating it if needed), and that token limits
    are reasonable. Prints any problems found.

    Returns:
        True if no issues were detected, False otherwise.
    """
    issues = []
    # Check LLM configuration: each backend needs its API credential.
    if LLM_BACKEND == "hf_api" and not HUGGINGFACE_TOKEN:
        issues.append("HF API selected but HUGGINGFACE_TOKEN not set")
    if LLM_BACKEND == "openai" and not OPENAI_API_KEY:
        issues.append("OpenAI selected but OPENAI_API_KEY not set")
    # Ensure the output directory exists. exist_ok avoids a race if it was
    # created between the check and the call; catch only filesystem errors
    # (a bare `except:` here would also swallow KeyboardInterrupt/SystemExit).
    if not os.path.exists(OUTPUT_DIR):
        try:
            os.makedirs(OUTPUT_DIR, exist_ok=True)
        except OSError:
            issues.append(f"Cannot create output directory: {OUTPUT_DIR}")
    # Sanity-check token limits.
    if MAX_CHUNK_TOKENS < 500:
        issues.append("MAX_CHUNK_TOKENS too small (< 500)")
    if MAX_TOKENS_PER_REQUEST < 100:
        issues.append("MAX_TOKENS_PER_REQUEST too small (< 100)")
    if issues:
        print("Configuration Issues:")
        for issue in issues:
            print(f"  - {issue}")
        return False
    return True
# ============================================================================
# INITIALIZATION
# ============================================================================
# Running this module directly dumps the configuration and reports whether
# it passes validation.
if __name__ == "__main__":
    print_config()
    if validate_config():
        print("\n✓ Configuration valid")
    else:
        print("\n✗ Configuration has issues")