| import os
|
| from typing import Dict, Any
|
|
|
|
|
|
|
|
|
|
|
|
|
# --- LLM backend selection ---------------------------------------------------
# Values observed in this file: "hf_api" (Hugging Face Inference API) and
# "openai"; any other value makes get_config() report LOCAL_MODEL as the
# active model — presumably a local-inference backend (TODO confirm).
LLM_BACKEND = os.getenv("LLM_BACKEND", "hf_api")

# Hugging Face Inference API credentials/model (used when LLM_BACKEND == "hf_api").
# validate_config() flags a missing token for this backend.
HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN", "")

HF_MODEL = os.getenv("HF_MODEL", "mistralai/Mixtral-8x7B-Instruct-v0.1")

# Local-model settings (reported by get_config() when LLM_BACKEND != "hf_api").
LOCAL_MODEL = os.getenv("LOCAL_MODEL", "google/flan-t5-xl")

# Inference device; "auto" presumably lets the runtime pick — TODO confirm consumer.
DEVICE = os.getenv("DEVICE", "auto")

# OpenAI settings (used when LLM_BACKEND == "openai").
# validate_config() flags a missing key for this backend.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")

OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-4")

# --- Generation parameters (backend-independent) ------------------------------
# Maximum completion tokens per LLM request; validate_config() rejects < 100.
MAX_TOKENS_PER_REQUEST = int(os.getenv("MAX_TOKENS_PER_REQUEST", "300"))

# Sampling temperature passed to the LLM.
LLM_TEMPERATURE = float(os.getenv("LLM_TEMPERATURE", "0.3"))

# Request timeout — presumably seconds; TODO confirm against the client code.
LLM_TIMEOUT = int(os.getenv("LLM_TIMEOUT", "120"))
|
|
|
|
|
|
|
|
|
|
|
# --- Transcript chunking ------------------------------------------------------
# Maximum tokens per chunk sent to the LLM; validate_config() rejects < 500.
MAX_CHUNK_TOKENS = int(os.getenv("MAX_CHUNK_TOKENS", "6000"))

# Token overlap between consecutive chunks (so context is not lost at seams).
OVERLAP_TOKENS = int(os.getenv("OVERLAP_TOKENS", "150"))

# Encoding name for token counting — "cl100k_base" is presumably a tiktoken
# encoding; TODO confirm which tokenizer library consumes this.
TOKENIZER_ENCODING = os.getenv("TOKENIZER_ENCODING", "cl100k_base")

# --- Quality gating -----------------------------------------------------------
# Minimum acceptable transcript quality score (reported via get_config()).
MIN_QUALITY_SCORE = float(os.getenv("MIN_QUALITY_SCORE", "0.3"))

# Minimum word count for a transcript to be analyzed.
MIN_WORD_COUNT = int(os.getenv("MIN_WORD_COUNT", "50"))

# Minimum character length — not exposed through get_config(); TODO confirm consumer.
MIN_TEXT_LENGTH = int(os.getenv("MIN_TEXT_LENGTH", "100"))

# Fixed score thresholds for quality bands (deliberately not env-configurable).
QUALITY_EXCELLENT = 0.8

QUALITY_GOOD = 0.6

QUALITY_FAIR = 0.4
|
|
|
|
|
|
|
|
|
|
|
# --- Input files --------------------------------------------------------------
# Per-file upload size ceiling, in megabytes.
MAX_FILE_SIZE_MB = int(os.getenv("MAX_FILE_SIZE_MB", "50"))

# Accepted input extensions.
SUPPORTED_FORMATS = [".docx", ".pdf"]

# Upper bound on files processed in one batch.
MAX_FILES_PER_BATCH = int(os.getenv("MAX_FILES_PER_BATCH", "10"))

# --- Output locations ---------------------------------------------------------
OUTPUT_DIR = os.getenv("OUTPUT_DIR", "./outputs")

CSV_FILENAME = "transcript_analysis.csv"

PDF_FILENAME = "transcript_report.pdf"

# NOTE: import-time side effect — creates OUTPUT_DIR as soon as this module
# is imported (validate_config() re-checks it defensively).
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
|
|
|
|
|
|
|
|
|
|
# --- Diagnostics --------------------------------------------------------------
# Boolean env flags are parsed case-insensitively; any value other than
# "true" (in any casing) yields False.
DEBUG_MODE = os.getenv("DEBUG_MODE", "False").lower() == "true"

VERBOSE_LOGGING = os.getenv("VERBOSE_LOGGING", "False").lower() == "true"

LOG_FILE = os.getenv("LOG_FILE", "transcript_analysis.log")

# --- Performance toggles ------------------------------------------------------
# Caching is on by default; everything else opt-in.
ENABLE_CACHING = os.getenv("ENABLE_CACHING", "True").lower() == "true"

CACHE_DIR = os.getenv("CACHE_DIR", "./.cache")

ENABLE_PARALLEL_PROCESSING = os.getenv("ENABLE_PARALLEL_PROCESSING", "False").lower() == "true"

# Worker count when parallel processing is enabled — TODO confirm consumer.
MAX_WORKERS = int(os.getenv("MAX_WORKERS", "4"))
|
|
|
|
|
|
|
|
|
|
|
# ------------------------------------------------------------------------------
# LLM system prompts
# ------------------------------------------------------------------------------

# Shared preamble prepended to both the HCP and patient prompts below.
BASE_SYSTEM_PROMPT = """You are an expert medical transcript analyzer specializing in healthcare interviews.

Your task is to extract structured, actionable insights from interview transcripts.

Core Principles:
- Focus on factual, verifiable medical information
- Distinguish between speaker roles accurately
- Filter out pleasantries, disclaimers, and off-topic content
- Extract specific medical terms, dosages, and treatment details
- Identify patterns and clinical reasoning
- Maintain objectivity and clinical accuracy
"""

# Prompt for healthcare-professional (HCP) interview transcripts.
HCP_SYSTEM_PROMPT = BASE_SYSTEM_PROMPT + """
Healthcare Professional Analysis Focus:
- Prescribing patterns and medication choices
- Diagnostic reasoning and clinical decision-making
- Treatment protocols and guidelines referenced
- Peer perspectives on efficacy and safety
- Barriers to treatment or adoption
- Off-label uses or emerging practices

Extract and structure:
1. Diagnoses mentioned with context
2. Prescriptions with dosage, frequency, and rationale
3. Treatment strategies and their justifications
4. Clinical guidelines or studies referenced
5. Challenges or barriers discussed
6. Key clinical insights or pearls
"""

# Prompt for patient interview transcripts.
PATIENT_SYSTEM_PROMPT = BASE_SYSTEM_PROMPT + """
Patient Interview Analysis Focus:
- Symptom descriptions and severity
- Treatment experiences and outcomes
- Side effects and tolerability
- Quality of life impacts
- Adherence challenges and enablers
- Emotional and psychological factors
- Healthcare system interactions

Extract and structure:
1. Primary symptoms with duration and severity
2. Current and past treatments
3. Treatment effectiveness and satisfaction
4. Side effects experienced
5. Concerns and unmet needs
6. Quality of life impacts
7. Support systems and resources
"""

# Prompt for cross-transcript trend summarization. Standalone: does NOT
# include BASE_SYSTEM_PROMPT.
SUMMARY_SYSTEM_PROMPT = """You are analyzing multiple transcripts to identify cross-cutting trends.

Focus on:
- Frequency analysis (how many interviewees mentioned X?)
- Common patterns and themes
- Consensus and disagreements
- Statistical insights (percentages, distributions)
- Actionable recommendations for stakeholders

Provide:
1. Quantitative summary (X% mentioned Y)
2. Key trends and patterns
3. Notable outliers or unique insights
4. Actionable recommendations
5. Data gaps or areas needing follow-up
"""
|
|
|
|
|
|
|
|
|
|
|
# Thresholds for validating generated/extracted text — presumably consumed by
# a validation module elsewhere in the project; TODO confirm exact semantics
# of each key against that consumer.
VALIDATION_CONFIG = {
    "min_word_ratio": 0.3,
    "max_repetition_ratio": 1.5,
    "min_sentences": 3,
    "check_errors": True,
    "check_gibberish": True
}
|
|
|
|
|
|
|
|
|
|
|
# Dashboard/plot rendering options — figure_size/dpi/style suggest matplotlib,
# TODO confirm the consumer. Colors are hex RGB.
DASHBOARD_CONFIG = {
    "figure_size": (14, 10),    # (width, height)
    "dpi": 100,
    "style": "default",
    "top_n_items": 8,           # how many top-ranked items to show per chart
    "color_scheme": {
        "primary": "#3498db",
        "secondary": "#2ecc71",
        "accent": "#e74c3c",
        "warning": "#f39c12"
    }
}
|
|
|
|
|
|
|
|
|
|
|
def get_config() -> Dict[str, Any]:
    """Assemble every module-level setting into one nested dictionary.

    Returns:
        A mapping with dict-valued sections ("llm", "chunking", "quality",
        "files", "output") plus three scalar boolean flags ("debug",
        "caching", "parallel").
    """
    # The reported model follows the active backend.
    active_model = HF_MODEL if LLM_BACKEND == "hf_api" else LOCAL_MODEL

    llm_section = {
        "backend": LLM_BACKEND,
        "model": active_model,
        "max_tokens": MAX_TOKENS_PER_REQUEST,
        "temperature": LLM_TEMPERATURE,
        "timeout": LLM_TIMEOUT,
    }
    chunking_section = {
        "max_tokens": MAX_CHUNK_TOKENS,
        "overlap": OVERLAP_TOKENS,
    }
    quality_section = {
        "min_score": MIN_QUALITY_SCORE,
        "min_words": MIN_WORD_COUNT,
    }
    files_section = {
        "max_size_mb": MAX_FILE_SIZE_MB,
        "max_per_batch": MAX_FILES_PER_BATCH,
        "supported": SUPPORTED_FORMATS,
    }
    output_section = {
        "directory": OUTPUT_DIR,
        "csv": CSV_FILENAME,
        "pdf": PDF_FILENAME,
    }

    return {
        "llm": llm_section,
        "chunking": chunking_section,
        "quality": quality_section,
        "files": files_section,
        "output": output_section,
        "debug": DEBUG_MODE,
        "caching": ENABLE_CACHING,
        "parallel": ENABLE_PARALLEL_PROCESSING,
    }
|
|
|
|
|
def print_config():
    """Pretty-print the current configuration to stdout.

    Dict-valued sections are printed key by key; scalar top-level entries
    ("debug", "caching", "parallel" in get_config()) are printed directly.

    Bug fixed: the previous version called ``settings.items()`` on every
    top-level value, which raised ``AttributeError`` as soon as it reached
    the first boolean flag — the function could never finish.
    """
    config = get_config()
    print("=" * 60)
    print("TRANSCRIPTORAI CONFIGURATION")
    print("=" * 60)
    for section, settings in config.items():
        print(f"\n{section.upper()}:")
        if isinstance(settings, dict):
            for key, value in settings.items():
                print(f"  {key}: {value}")
        else:
            # Scalar flag: no sub-keys to iterate.
            print(f"  {settings}")
    print("=" * 60)
|
|
|
|
|
def validate_config() -> bool:
    """Check the current settings for common misconfigurations.

    Prints each problem found and returns True when the configuration is
    usable, False otherwise.
    """
    issues = []

    # Credentials must be present for the selected remote backend.
    if LLM_BACKEND == "hf_api" and not HUGGINGFACE_TOKEN:
        issues.append("HF API selected but HUGGINGFACE_TOKEN not set")

    if LLM_BACKEND == "openai" and not OPENAI_API_KEY:
        issues.append("OpenAI selected but OPENAI_API_KEY not set")

    # Ensure the output directory exists (module import normally creates it,
    # but OUTPUT_DIR may have been removed since).
    if not os.path.exists(OUTPUT_DIR):
        try:
            # exist_ok avoids a race if another process creates it first;
            # matches the import-time makedirs call above.
            os.makedirs(OUTPUT_DIR, exist_ok=True)
        except OSError:
            # Narrow catch: a bare ``except`` would also swallow
            # KeyboardInterrupt/SystemExit.
            issues.append(f"Cannot create output directory: {OUTPUT_DIR}")

    # Sanity bounds on token budgets.
    if MAX_CHUNK_TOKENS < 500:
        issues.append("MAX_CHUNK_TOKENS too small (< 500)")

    if MAX_TOKENS_PER_REQUEST < 100:
        issues.append("MAX_TOKENS_PER_REQUEST too small (< 100)")

    if issues:
        print("Configuration Issues:")
        for issue in issues:
            print(f"  - {issue}")
        return False

    return True
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Show the active settings, then report the validation verdict.
    print_config()
    verdict = "\n✓ Configuration valid" if validate_config() else "\n✗ Configuration has issues"
    print(verdict)