| import os |
| from typing import Dict, Any |
|
|
| |
| |
| |
|
|
| |
# Which LLM backend the application uses (pinned in code, not read from the environment).
LLM_BACKEND = "hf_api"

# Hugging Face Inference API settings; the token is an empty string when unset.
HUGGINGFACE_TOKEN = os.environ.get("HUGGINGFACE_TOKEN", "")
HF_MODEL = "mistralai/Mistral-7B-Instruct-v0.2"

# Local model settings, overridable through the environment.
LOCAL_MODEL = os.environ.get("LOCAL_MODEL", "google/flan-t5-xl")
DEVICE = os.environ.get("DEVICE", "auto")

# OpenAI settings; the key is an empty string when unset.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
OPENAI_MODEL = os.environ.get("OPENAI_MODEL", "gpt-4")

# Per-request generation settings. Only the temperature is environment-driven.
MAX_TOKENS_PER_REQUEST = 100
LLM_TEMPERATURE = float(os.environ.get("LLM_TEMPERATURE", "0.3"))
LLM_TIMEOUT = 25
|
|
| |
| |
| |
|
|
# Chunking: the chunk size is fixed in code; overlap and tokenizer encoding
# may be overridden through the environment.
MAX_CHUNK_TOKENS = 2000
OVERLAP_TOKENS = int(os.environ.get("OVERLAP_TOKENS", "150"))
TOKENIZER_ENCODING = os.environ.get("TOKENIZER_ENCODING", "cl100k_base")
|
|
| |
| |
| |
|
|
# Minimum acceptance thresholds, each overridable through the environment.
MIN_QUALITY_SCORE = float(os.environ.get("MIN_QUALITY_SCORE", "0.3"))
MIN_WORD_COUNT = int(os.environ.get("MIN_WORD_COUNT", "50"))
MIN_TEXT_LENGTH = int(os.environ.get("MIN_TEXT_LENGTH", "100"))

# Fixed score cut-offs for the named quality bands.
QUALITY_EXCELLENT = 0.8
QUALITY_GOOD = 0.6
QUALITY_FAIR = 0.4
|
|
| |
| |
| |
|
|
# Upload limits (environment-overridable) and the accepted file extensions.
MAX_FILE_SIZE_MB = int(os.environ.get("MAX_FILE_SIZE_MB", "50"))
SUPPORTED_FORMATS = [
    ".docx",
    ".pdf",
]
MAX_FILES_PER_BATCH = int(os.environ.get("MAX_FILES_PER_BATCH", "10"))
|
|
| |
| |
| |
|
|
# Where generated artifacts go, and the file names used for them.
OUTPUT_DIR = os.environ.get("OUTPUT_DIR", "./outputs")
CSV_FILENAME = "transcript_analysis.csv"
PDF_FILENAME = "transcript_report.pdf"

# Import-time side effect: make sure the output directory exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
|
| |
| |
| |
|
|
def _env_flag(name: str, default: str) -> bool:
    """Return True only when the variable's value, lowercased, equals "true"."""
    return os.environ.get(name, default).lower() == "true"


# Logging and debugging switches.
DEBUG_MODE = _env_flag("DEBUG_MODE", "False")
VERBOSE_LOGGING = _env_flag("VERBOSE_LOGGING", "False")
LOG_FILE = os.environ.get("LOG_FILE", "transcript_analysis.log")

# Caching switch and cache location.
ENABLE_CACHING = _env_flag("ENABLE_CACHING", "True")
CACHE_DIR = os.environ.get("CACHE_DIR", "./.cache")

# Parallel-processing switch and worker count.
ENABLE_PARALLEL_PROCESSING = _env_flag("ENABLE_PARALLEL_PROCESSING", "False")
MAX_WORKERS = int(os.environ.get("MAX_WORKERS", "4"))
|
|
| |
| |
| |
|
|
# Shared preamble; the HCP and patient prompts below both start with it.
BASE_SYSTEM_PROMPT = """You are an expert medical transcript analyzer specializing in healthcare interviews.

Your task is to extract structured, actionable insights from interview transcripts.

Core Principles:
- Focus on factual, verifiable medical information
- Distinguish between speaker roles accurately
- Filter out pleasantries, disclaimers, and off-topic content
- Extract specific medical terms, dosages, and treatment details
- Identify patterns and clinical reasoning
- Maintain objectivity and clinical accuracy
"""

# Base prompt followed by the healthcare-professional focus and extraction list.
HCP_SYSTEM_PROMPT = f"""{BASE_SYSTEM_PROMPT}
Healthcare Professional Analysis Focus:
- Prescribing patterns and medication choices
- Diagnostic reasoning and clinical decision-making
- Treatment protocols and guidelines referenced
- Peer perspectives on efficacy and safety
- Barriers to treatment or adoption
- Off-label uses or emerging practices

Extract and structure:
1. Diagnoses mentioned with context
2. Prescriptions with dosage, frequency, and rationale
3. Treatment strategies and their justifications
4. Clinical guidelines or studies referenced
5. Challenges or barriers discussed
6. Key clinical insights or pearls
"""

# Base prompt followed by the patient-interview focus and extraction list.
PATIENT_SYSTEM_PROMPT = f"""{BASE_SYSTEM_PROMPT}
Patient Interview Analysis Focus:
- Symptom descriptions and severity
- Treatment experiences and outcomes
- Side effects and tolerability
- Quality of life impacts
- Adherence challenges and enablers
- Emotional and psychological factors
- Healthcare system interactions

Extract and structure:
1. Primary symptoms with duration and severity
2. Current and past treatments
3. Treatment effectiveness and satisfaction
4. Side effects experienced
5. Concerns and unmet needs
6. Quality of life impacts
7. Support systems and resources
"""

# Stand-alone prompt for cross-transcript summaries; it does not include the base prompt.
SUMMARY_SYSTEM_PROMPT = """You are analyzing multiple transcripts to identify cross-cutting trends.

Focus on:
- Frequency analysis (how many interviewees mentioned X?)
- Common patterns and themes
- Consensus and disagreements
- Statistical insights (percentages, distributions)
- Actionable recommendations for stakeholders

Provide:
1. Quantitative summary (X% mentioned Y)
2. Key trends and patterns
3. Notable outliers or unique insights
4. Actionable recommendations
5. Data gaps or areas needing follow-up
"""
|
|
| |
| |
| |
|
|
# Settings for transcript validation: ratio limits, a sentence minimum,
# and switches for the error and gibberish checks.
VALIDATION_CONFIG = dict(
    min_word_ratio=0.3,
    max_repetition_ratio=1.5,
    min_sentences=3,
    check_errors=True,
    check_gibberish=True,
)
|
|
| |
| |
| |
|
|
# Dashboard rendering settings: figure geometry, style name, item cap,
# and a nested palette of hex colours.
DASHBOARD_CONFIG = dict(
    figure_size=(14, 10),
    dpi=100,
    style="default",
    top_n_items=8,
    color_scheme=dict(
        primary="#3498db",
        secondary="#2ecc71",
        accent="#e74c3c",
        warning="#f39c12",
    ),
)
|
|
| |
| |
| |
|
|
def get_config() -> Dict[str, Any]:
    """Return the module's configuration as a nested dictionary.

    The result groups the module-level constants under the keys "llm",
    "chunking", "quality", "files" and "output" (each a dict), plus the
    top-level booleans "debug", "caching" and "parallel".

    Returns:
        A new dictionary built from the current module-level values.
    """
    # Report the model that belongs to the selected backend. Anything other
    # than "hf_api" or "openai" falls back to the local model, as before.
    if LLM_BACKEND == "hf_api":
        model = HF_MODEL
    elif LLM_BACKEND == "openai":
        model = OPENAI_MODEL
    else:
        model = LOCAL_MODEL

    return {
        "llm": {
            "backend": LLM_BACKEND,
            "model": model,
            "max_tokens": MAX_TOKENS_PER_REQUEST,
            "temperature": LLM_TEMPERATURE,
            "timeout": LLM_TIMEOUT,
        },
        "chunking": {
            "max_tokens": MAX_CHUNK_TOKENS,
            "overlap": OVERLAP_TOKENS,
        },
        "quality": {
            "min_score": MIN_QUALITY_SCORE,
            "min_words": MIN_WORD_COUNT,
        },
        "files": {
            "max_size_mb": MAX_FILE_SIZE_MB,
            "max_per_batch": MAX_FILES_PER_BATCH,
            "supported": SUPPORTED_FORMATS,
        },
        "output": {
            "directory": OUTPUT_DIR,
            "csv": CSV_FILENAME,
            "pdf": PDF_FILENAME,
        },
        "debug": DEBUG_MODE,
        "caching": ENABLE_CACHING,
        "parallel": ENABLE_PARALLEL_PROCESSING,
    }
|
|
|
|
def print_config() -> None:
    """Print the configuration returned by get_config() to stdout.

    Dictionary-valued sections are printed as an upper-cased header followed
    by one "key: value" line per entry. Scalar entries are printed on a
    single "HEADER: value" line.
    """
    config = get_config()
    banner = "=" * 60

    print(banner)
    print("TRANSCRIPTORAI CONFIGURATION")
    print(banner)
    for section, settings in config.items():
        if isinstance(settings, dict):
            print(f"\n{section.upper()}:")
            for key, value in settings.items():
                print(f" {key}: {value}")
        else:
            # "debug", "caching" and "parallel" are plain booleans with no
            # .items(); iterating them used to raise AttributeError.
            print(f"\n{section.upper()}: {settings}")
    print(banner)
|
|
|
|
def validate_config() -> bool:
    """Check the module-level settings and report any problems found.

    Checks that the selected remote backend has its credential, that the
    output directory exists or can be created, and that the token limits
    are not below their minimums. Each problem is printed to stdout under
    a "Configuration Issues:" header.

    Returns:
        True when no problem was found, False otherwise.
    """
    issues = []

    # A remote backend is unusable without its credential.
    if LLM_BACKEND == "hf_api" and not HUGGINGFACE_TOKEN:
        issues.append("HF API selected but HUGGINGFACE_TOKEN not set")

    if LLM_BACKEND == "openai" and not OPENAI_API_KEY:
        issues.append("OpenAI selected but OPENAI_API_KEY not set")

    # exist_ok=True makes this a no-op for an existing directory and avoids
    # the check-then-create race. Catching OSError only (not a bare except)
    # lets KeyboardInterrupt and SystemExit propagate, and also flags a path
    # that exists but is not a directory.
    try:
        os.makedirs(OUTPUT_DIR, exist_ok=True)
    except OSError:
        issues.append(f"Cannot create output directory: {OUTPUT_DIR}")

    if MAX_CHUNK_TOKENS < 500:
        issues.append("MAX_CHUNK_TOKENS too small (< 500)")

    if MAX_TOKENS_PER_REQUEST < 100:
        issues.append("MAX_TOKENS_PER_REQUEST too small (< 100)")

    if issues:
        print("Configuration Issues:")
        for issue in issues:
            print(f" - {issue}")
        return False

    return True
|
|
|
|
| |
| |
| |
|
|
# Script entry point: show the configuration, then report whether it validates.
if __name__ == "__main__":
    print_config()
    verdict = (
        "\n✓ Configuration valid"
        if validate_config()
        else "\n✗ Configuration has issues"
    )
    print(verdict)