| import os
|
| from typing import Dict, Any
|
|
|
|
|
|
|
|
|
|
|
|
|
# --- LLM backend selection ---------------------------------------------------
# Values observed in this file: "hf_api" (Hugging Face Inference API) and
# "openai"; any other value makes get_config() report LOCAL_MODEL as the
# active model — presumably a local-inference backend (TODO confirm).
LLM_BACKEND = os.getenv("LLM_BACKEND", "hf_api")

# Hugging Face Inference API credentials/model (used when LLM_BACKEND == "hf_api").
# validate_config() flags a missing token for this backend.
HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN", "")

HF_MODEL = os.getenv("HF_MODEL", "mistralai/Mixtral-8x7B-Instruct-v0.1")

# Local-model settings (reported by get_config() when LLM_BACKEND != "hf_api").
LOCAL_MODEL = os.getenv("LOCAL_MODEL", "google/flan-t5-xl")

# Inference device; "auto" presumably lets the runtime pick — TODO confirm consumer.
DEVICE = os.getenv("DEVICE", "auto")

# OpenAI settings (used when LLM_BACKEND == "openai").
# validate_config() flags a missing key for this backend.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")

OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-4")

# --- Generation parameters (backend-independent) ------------------------------
# Maximum completion tokens per LLM request; validate_config() rejects < 100.
MAX_TOKENS_PER_REQUEST = int(os.getenv("MAX_TOKENS_PER_REQUEST", "300"))

# Sampling temperature passed to the LLM.
LLM_TEMPERATURE = float(os.getenv("LLM_TEMPERATURE", "0.3"))

# Request timeout — presumably seconds; TODO confirm against the client code.
LLM_TIMEOUT = int(os.getenv("LLM_TIMEOUT", "120"))
|
|
|
|
|
|
|
|
|
|
|
# --- Transcript chunking ------------------------------------------------------
# Maximum tokens per chunk sent to the LLM; validate_config() rejects < 500.
MAX_CHUNK_TOKENS = int(os.getenv("MAX_CHUNK_TOKENS", "6000"))

# Token overlap between consecutive chunks (so context is not lost at seams).
OVERLAP_TOKENS = int(os.getenv("OVERLAP_TOKENS", "150"))

# Encoding name for token counting — "cl100k_base" is presumably a tiktoken
# encoding; TODO confirm which tokenizer library consumes this.
TOKENIZER_ENCODING = os.getenv("TOKENIZER_ENCODING", "cl100k_base")

# --- Quality gating -----------------------------------------------------------
# Minimum acceptable transcript quality score (reported via get_config()).
MIN_QUALITY_SCORE = float(os.getenv("MIN_QUALITY_SCORE", "0.3"))

# Minimum word count for a transcript to be analyzed.
MIN_WORD_COUNT = int(os.getenv("MIN_WORD_COUNT", "50"))

# Minimum character length — not exposed through get_config(); TODO confirm consumer.
MIN_TEXT_LENGTH = int(os.getenv("MIN_TEXT_LENGTH", "100"))

# Fixed score thresholds for quality bands (deliberately not env-configurable).
QUALITY_EXCELLENT = 0.8

QUALITY_GOOD = 0.6

QUALITY_FAIR = 0.4
|
|
|
|
|
|
|
|
|
|
|
# --- Input files --------------------------------------------------------------
# Per-file upload size ceiling, in megabytes.
MAX_FILE_SIZE_MB = int(os.getenv("MAX_FILE_SIZE_MB", "50"))

# Accepted input extensions.
SUPPORTED_FORMATS = [".docx", ".pdf"]

# Upper bound on files processed in one batch.
MAX_FILES_PER_BATCH = int(os.getenv("MAX_FILES_PER_BATCH", "10"))

# --- Output locations ---------------------------------------------------------
OUTPUT_DIR = os.getenv("OUTPUT_DIR", "./outputs")

CSV_FILENAME = "transcript_analysis.csv"

PDF_FILENAME = "transcript_report.pdf"

# NOTE: import-time side effect — creates OUTPUT_DIR as soon as this module
# is imported (validate_config() re-checks it defensively).
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
|
|
|
|
|
|
|
|
|
|
# --- Diagnostics --------------------------------------------------------------
# Boolean env flags are parsed case-insensitively; any value other than
# "true" (in any casing) yields False.
DEBUG_MODE = os.getenv("DEBUG_MODE", "False").lower() == "true"

VERBOSE_LOGGING = os.getenv("VERBOSE_LOGGING", "False").lower() == "true"

LOG_FILE = os.getenv("LOG_FILE", "transcript_analysis.log")

# --- Performance toggles ------------------------------------------------------
# Caching is on by default; everything else opt-in.
ENABLE_CACHING = os.getenv("ENABLE_CACHING", "True").lower() == "true"

CACHE_DIR = os.getenv("CACHE_DIR", "./.cache")

ENABLE_PARALLEL_PROCESSING = os.getenv("ENABLE_PARALLEL_PROCESSING", "False").lower() == "true"

# Worker count when parallel processing is enabled — TODO confirm consumer.
MAX_WORKERS = int(os.getenv("MAX_WORKERS", "4"))
|
|
|
|
|
|
|
|
|
|
|
# ------------------------------------------------------------------------------
# LLM system prompts
# ------------------------------------------------------------------------------

# Shared preamble prepended to both the HCP and patient prompts below.
BASE_SYSTEM_PROMPT = """You are an expert medical transcript analyzer specializing in healthcare interviews.

Your task is to extract structured, actionable insights from interview transcripts.

Core Principles:
- Focus on factual, verifiable medical information
- Distinguish between speaker roles accurately
- Filter out pleasantries, disclaimers, and off-topic content
- Extract specific medical terms, dosages, and treatment details
- Identify patterns and clinical reasoning
- Maintain objectivity and clinical accuracy
"""

# Prompt for healthcare-professional (HCP) interview transcripts.
HCP_SYSTEM_PROMPT = BASE_SYSTEM_PROMPT + """
Healthcare Professional Analysis Focus:
- Prescribing patterns and medication choices
- Diagnostic reasoning and clinical decision-making
- Treatment protocols and guidelines referenced
- Peer perspectives on efficacy and safety
- Barriers to treatment or adoption
- Off-label uses or emerging practices

Extract and structure:
1. Diagnoses mentioned with context
2. Prescriptions with dosage, frequency, and rationale
3. Treatment strategies and their justifications
4. Clinical guidelines or studies referenced
5. Challenges or barriers discussed
6. Key clinical insights or pearls
"""

# Prompt for patient interview transcripts.
PATIENT_SYSTEM_PROMPT = BASE_SYSTEM_PROMPT + """
Patient Interview Analysis Focus:
- Symptom descriptions and severity
- Treatment experiences and outcomes
- Side effects and tolerability
- Quality of life impacts
- Adherence challenges and enablers
- Emotional and psychological factors
- Healthcare system interactions

Extract and structure:
1. Primary symptoms with duration and severity
2. Current and past treatments
3. Treatment effectiveness and satisfaction
4. Side effects experienced
5. Concerns and unmet needs
6. Quality of life impacts
7. Support systems and resources
"""

# Prompt for cross-transcript trend summarization. Standalone: does NOT
# include BASE_SYSTEM_PROMPT.
SUMMARY_SYSTEM_PROMPT = """You are analyzing multiple transcripts to identify cross-cutting trends.

Focus on:
- Frequency analysis (how many interviewees mentioned X?)
- Common patterns and themes
- Consensus and disagreements
- Statistical insights (percentages, distributions)
- Actionable recommendations for stakeholders

Provide:
1. Quantitative summary (X% mentioned Y)
2. Key trends and patterns
3. Notable outliers or unique insights
4. Actionable recommendations
5. Data gaps or areas needing follow-up
"""
|
|
|
|
|
|
|
|
|
|
|
# Thresholds for validating generated/extracted text — presumably consumed by
# a validation module elsewhere in the project; TODO confirm exact semantics
# of each key against that consumer.
VALIDATION_CONFIG = {
    "min_word_ratio": 0.3,
    "max_repetition_ratio": 1.5,
    "min_sentences": 3,
    "check_errors": True,
    "check_gibberish": True
}
|
|
|
|
|
|
|
|
|
|
|
# Dashboard/plot rendering options — figure_size/dpi/style suggest matplotlib,
# TODO confirm the consumer. Colors are hex RGB.
DASHBOARD_CONFIG = {
    "figure_size": (14, 10),    # (width, height)
    "dpi": 100,
    "style": "default",
    "top_n_items": 8,           # how many top-ranked items to show per chart
    "color_scheme": {
        "primary": "#3498db",
        "secondary": "#2ecc71",
        "accent": "#e74c3c",
        "warning": "#f39c12"
    }
}
|
|
|
|
|
|
|
|
|
|
|
def get_config() -> Dict[str, Any]:
    """Assemble every module-level setting into one nested dictionary.

    Returns:
        A mapping with dict-valued sections ("llm", "chunking", "quality",
        "files", "output") plus three scalar boolean flags ("debug",
        "caching", "parallel").
    """
    # The reported model follows the active backend.
    active_model = HF_MODEL if LLM_BACKEND == "hf_api" else LOCAL_MODEL

    llm_section = {
        "backend": LLM_BACKEND,
        "model": active_model,
        "max_tokens": MAX_TOKENS_PER_REQUEST,
        "temperature": LLM_TEMPERATURE,
        "timeout": LLM_TIMEOUT,
    }
    chunking_section = {
        "max_tokens": MAX_CHUNK_TOKENS,
        "overlap": OVERLAP_TOKENS,
    }
    quality_section = {
        "min_score": MIN_QUALITY_SCORE,
        "min_words": MIN_WORD_COUNT,
    }
    files_section = {
        "max_size_mb": MAX_FILE_SIZE_MB,
        "max_per_batch": MAX_FILES_PER_BATCH,
        "supported": SUPPORTED_FORMATS,
    }
    output_section = {
        "directory": OUTPUT_DIR,
        "csv": CSV_FILENAME,
        "pdf": PDF_FILENAME,
    }

    return {
        "llm": llm_section,
        "chunking": chunking_section,
        "quality": quality_section,
        "files": files_section,
        "output": output_section,
        "debug": DEBUG_MODE,
        "caching": ENABLE_CACHING,
        "parallel": ENABLE_PARALLEL_PROCESSING,
    }
|
|
|
|
|
def print_config():
    """Pretty-print the current configuration to stdout.

    Dict-valued sections are printed key by key; scalar top-level entries
    ("debug", "caching", "parallel" in get_config()) are printed directly.

    Bug fixed: the previous version called ``settings.items()`` on every
    top-level value, which raised ``AttributeError`` as soon as it reached
    the first boolean flag — the function could never finish.
    """
    config = get_config()
    print("=" * 60)
    print("TRANSCRIPTORAI CONFIGURATION")
    print("=" * 60)
    for section, settings in config.items():
        print(f"\n{section.upper()}:")
        if isinstance(settings, dict):
            for key, value in settings.items():
                print(f"  {key}: {value}")
        else:
            # Scalar flag: no sub-keys to iterate.
            print(f"  {settings}")
    print("=" * 60)
|
|
|
|
|
def validate_config() -> bool:
    """Check the current settings for common misconfigurations.

    Prints each problem found and returns True when the configuration is
    usable, False otherwise.
    """
    issues = []

    # Credentials must be present for the selected remote backend.
    if LLM_BACKEND == "hf_api" and not HUGGINGFACE_TOKEN:
        issues.append("HF API selected but HUGGINGFACE_TOKEN not set")

    if LLM_BACKEND == "openai" and not OPENAI_API_KEY:
        issues.append("OpenAI selected but OPENAI_API_KEY not set")

    # Ensure the output directory exists (module import normally creates it,
    # but OUTPUT_DIR may have been removed since).
    if not os.path.exists(OUTPUT_DIR):
        try:
            # exist_ok avoids a race if another process creates it first;
            # matches the import-time makedirs call above.
            os.makedirs(OUTPUT_DIR, exist_ok=True)
        except OSError:
            # Narrow catch: a bare ``except`` would also swallow
            # KeyboardInterrupt/SystemExit.
            issues.append(f"Cannot create output directory: {OUTPUT_DIR}")

    # Sanity bounds on token budgets.
    if MAX_CHUNK_TOKENS < 500:
        issues.append("MAX_CHUNK_TOKENS too small (< 500)")

    if MAX_TOKENS_PER_REQUEST < 100:
        issues.append("MAX_TOKENS_PER_REQUEST too small (< 100)")

    if issues:
        print("Configuration Issues:")
        for issue in issues:
            print(f"  - {issue}")
        return False

    return True
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Show the active settings, then report the validation verdict.
    print_config()
    verdict = "\n✓ Configuration valid" if validate_config() else "\n✗ Configuration has issues"
    print(verdict)