File size: 9,519 Bytes
a233900
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54c99ad
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
import os
from typing import Dict, Any

# ============================================================================
# LLM CONFIGURATION
# ============================================================================

# Backend selector. Accepted values: "hf_api" (recommended), "local", "openai".
# Hard-wired to the hosted HF API for the HF Spaces deployment.
LLM_BACKEND = "hf_api"

# --- Hugging Face ---------------------------------------------------------
# The model is fixed (a lighter choice for Spaces); the token comes from the
# environment and defaults to an empty string when unset.
HF_MODEL = "mistralai/Mistral-7B-Instruct-v0.2"
HUGGINGFACE_TOKEN = os.environ.get("HUGGINGFACE_TOKEN", "")

# --- Local model ----------------------------------------------------------
# DEVICE is one of "auto", "cpu", "cuda", "mps".
DEVICE = os.environ.get("DEVICE", "auto")
LOCAL_MODEL = os.environ.get("LOCAL_MODEL", "google/flan-t5-xl")

# --- OpenAI (only relevant for the "openai" backend) ----------------------
OPENAI_MODEL = os.environ.get("OPENAI_MODEL", "gpt-4")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")

# --- Generation parameters ------------------------------------------------
# Token budget and timeout are fixed for Spaces (faster responses, platform
# timeout limit); only the temperature can be overridden via the environment.
LLM_TIMEOUT = 25
MAX_TOKENS_PER_REQUEST = 100
LLM_TEMPERATURE = float(os.environ.get("LLM_TEMPERATURE", "0.3"))

# ============================================================================
# CHUNKING CONFIGURATION
# ============================================================================

# Tokenizer encoding name and the token overlap between chunks are
# environment-overridable; the chunk size itself is fixed (kept light for Spaces).
TOKENIZER_ENCODING = os.environ.get("TOKENIZER_ENCODING", "cl100k_base")
OVERLAP_TOKENS = int(os.environ.get("OVERLAP_TOKENS", "150"))
MAX_CHUNK_TOKENS = 2000

# ============================================================================
# QUALITY THRESHOLDS
# ============================================================================

# Minimum acceptance limits, each overridable through an environment variable
# of the same name.
MIN_TEXT_LENGTH = int(os.environ.get("MIN_TEXT_LENGTH", "100"))
MIN_WORD_COUNT = int(os.environ.get("MIN_WORD_COUNT", "50"))
MIN_QUALITY_SCORE = float(os.environ.get("MIN_QUALITY_SCORE", "0.3"))

# Fixed score cut-offs for the quality grades, highest grade first.
QUALITY_EXCELLENT, QUALITY_GOOD, QUALITY_FAIR = 0.8, 0.6, 0.4

# ============================================================================
# FILE PROCESSING CONFIGURATION
# ============================================================================

# Accepted upload extensions (lower-case, with the leading dot).
SUPPORTED_FORMATS = [".docx", ".pdf"]
# Batch and size limits; both may be overridden from the environment.
MAX_FILES_PER_BATCH = int(os.environ.get("MAX_FILES_PER_BATCH", "10"))
MAX_FILE_SIZE_MB = int(os.environ.get("MAX_FILE_SIZE_MB", "50"))

# ============================================================================
# OUTPUT CONFIGURATION
# ============================================================================

# Fixed names of the generated report files.
PDF_FILENAME = "transcript_report.pdf"
CSV_FILENAME = "transcript_analysis.csv"

# Destination directory for generated files (environment-overridable).
# It is created at import time so later code can write into it directly.
OUTPUT_DIR = os.environ.get("OUTPUT_DIR", "./outputs")
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ============================================================================
# DEBUG AND LOGGING
# ============================================================================

# Path of the log file (environment-overridable).
LOG_FILE = os.environ.get("LOG_FILE", "transcript_analysis.log")
# Boolean switches: enabled only when the variable spells "true" in any
# letter case; every other value, or an unset variable, disables them.
VERBOSE_LOGGING = "true" == os.environ.get("VERBOSE_LOGGING", "False").lower()
DEBUG_MODE = "true" == os.environ.get("DEBUG_MODE", "False").lower()

# ============================================================================
# ADVANCED SETTINGS
# ============================================================================

# Extracted-text cache, used to avoid re-processing. Enabled unless
# ENABLE_CACHING is set to something other than "true" (case-insensitive).
CACHE_DIR = os.environ.get("CACHE_DIR", "./.cache")
ENABLE_CACHING = "true" == os.environ.get("ENABLE_CACHING", "True").lower()

# Parallel processing. Disabled unless ENABLE_PARALLEL_PROCESSING is "true"
# (case-insensitive); MAX_WORKERS defaults to 4.
MAX_WORKERS = int(os.environ.get("MAX_WORKERS", "4"))
ENABLE_PARALLEL_PROCESSING = (
    "true" == os.environ.get("ENABLE_PARALLEL_PROCESSING", "False").lower()
)

# ============================================================================
# SYSTEM PROMPTS
# ============================================================================

# Shared preamble: sets the analyzer role and the core extraction principles.
# It is prepended verbatim to the HCP and patient prompts defined below.
# NOTE: the text inside these literals is runtime data sent to the model;
# editing any character changes behavior.
BASE_SYSTEM_PROMPT = """You are an expert medical transcript analyzer specializing in healthcare interviews.

Your task is to extract structured, actionable insights from interview transcripts.

Core Principles:
- Focus on factual, verifiable medical information
- Distinguish between speaker roles accurately
- Filter out pleasantries, disclaimers, and off-topic content
- Extract specific medical terms, dosages, and treatment details
- Identify patterns and clinical reasoning
- Maintain objectivity and clinical accuracy
"""

# Prompt for healthcare-professional (HCP) interviews: the shared preamble
# followed by HCP-specific focus areas and a six-item extraction list.
HCP_SYSTEM_PROMPT = BASE_SYSTEM_PROMPT + """
Healthcare Professional Analysis Focus:
- Prescribing patterns and medication choices
- Diagnostic reasoning and clinical decision-making
- Treatment protocols and guidelines referenced
- Peer perspectives on efficacy and safety
- Barriers to treatment or adoption
- Off-label uses or emerging practices

Extract and structure:
1. Diagnoses mentioned with context
2. Prescriptions with dosage, frequency, and rationale
3. Treatment strategies and their justifications
4. Clinical guidelines or studies referenced
5. Challenges or barriers discussed
6. Key clinical insights or pearls
"""

# Prompt for patient interviews: the shared preamble followed by
# patient-specific focus areas and a seven-item extraction list.
PATIENT_SYSTEM_PROMPT = BASE_SYSTEM_PROMPT + """
Patient Interview Analysis Focus:
- Symptom descriptions and severity
- Treatment experiences and outcomes
- Side effects and tolerability
- Quality of life impacts
- Adherence challenges and enablers
- Emotional and psychological factors
- Healthcare system interactions

Extract and structure:
1. Primary symptoms with duration and severity
2. Current and past treatments
3. Treatment effectiveness and satisfaction
4. Side effects experienced
5. Concerns and unmet needs
6. Quality of life impacts
7. Support systems and resources
"""

# Prompt for cross-transcript trend summaries. Standalone: unlike the two
# prompts above, it does NOT include BASE_SYSTEM_PROMPT.
SUMMARY_SYSTEM_PROMPT = """You are analyzing multiple transcripts to identify cross-cutting trends.

Focus on:
- Frequency analysis (how many interviewees mentioned X?)
- Common patterns and themes
- Consensus and disagreements
- Statistical insights (percentages, distributions)
- Actionable recommendations for stakeholders

Provide:
1. Quantitative summary (X% mentioned Y)
2. Key trends and patterns
3. Notable outliers or unique insights
4. Actionable recommendations
5. Data gaps or areas needing follow-up
"""

# ============================================================================
# VALIDATION SETTINGS
# ============================================================================

# Knobs for transcript text validation: two numeric ratios, a minimum
# sentence count, and two boolean check switches.
VALIDATION_CONFIG = dict(
    min_word_ratio=0.3,
    max_repetition_ratio=1.5,
    min_sentences=3,
    check_errors=True,
    check_gibberish=True,
)

# ============================================================================
# DASHBOARD SETTINGS
# ============================================================================

# Dashboard rendering options: figure size and DPI, style name, how many
# top items to show, and a palette of named hex colours.
DASHBOARD_CONFIG = dict(
    figure_size=(14, 10),
    dpi=100,
    style="default",
    top_n_items=8,
    color_scheme=dict(
        primary="#3498db",
        secondary="#2ecc71",
        accent="#e74c3c",
        warning="#f39c12",
    ),
)

# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

def get_config() -> Dict[str, Any]:
    """Return a snapshot of the main settings as a dictionary.

    The result mixes two kinds of entries: nested dictionaries for the
    "llm", "chunking", "quality", "files" and "output" sections, and plain
    booleans for the "debug", "caching" and "parallel" keys. Callers that
    iterate over the result must handle both shapes.

    Returns:
        A new dictionary built from the module-level constants on each call.
    """
    # Report the model that belongs to the selected backend. The "openai"
    # backend previously fell through to LOCAL_MODEL; unknown backends still
    # fall back to LOCAL_MODEL, as before.
    model_by_backend = {
        "hf_api": HF_MODEL,
        "local": LOCAL_MODEL,
        "openai": OPENAI_MODEL,
    }

    return {
        "llm": {
            "backend": LLM_BACKEND,
            "model": model_by_backend.get(LLM_BACKEND, LOCAL_MODEL),
            "max_tokens": MAX_TOKENS_PER_REQUEST,
            "temperature": LLM_TEMPERATURE,
            "timeout": LLM_TIMEOUT
        },
        "chunking": {
            "max_tokens": MAX_CHUNK_TOKENS,
            "overlap": OVERLAP_TOKENS
        },
        "quality": {
            "min_score": MIN_QUALITY_SCORE,
            "min_words": MIN_WORD_COUNT
        },
        "files": {
            "max_size_mb": MAX_FILE_SIZE_MB,
            "max_per_batch": MAX_FILES_PER_BATCH,
            "supported": SUPPORTED_FORMATS
        },
        "output": {
            "directory": OUTPUT_DIR,
            "csv": CSV_FILENAME,
            "pdf": PDF_FILENAME
        },
        # Scalar flags (not nested sections).
        "debug": DEBUG_MODE,
        "caching": ENABLE_CACHING,
        "parallel": ENABLE_PARALLEL_PROCESSING
    }


def print_config() -> None:
    """Print the current configuration to stdout in a readable layout.

    Nested sections are printed as an upper-cased heading followed by one
    indented "key: value" line per setting. Top-level entries whose value
    is not a dictionary (the "debug", "caching" and "parallel" flags) are
    printed on a single "NAME: value" line; calling ``.items()`` on them, as
    the earlier version did, raised AttributeError.
    """
    config = get_config()
    rule = "=" * 60

    print(rule)
    print("TRANSCRIPTORAI CONFIGURATION")
    print(rule)
    for section, settings in config.items():
        if isinstance(settings, dict):
            print(f"\n{section.upper()}:")
            for key, value in settings.items():
                print(f"  {key}: {value}")
        else:
            # Scalar entry: there is nothing to iterate over.
            print(f"\n{section.upper()}: {settings}")
    print(rule)


def validate_config() -> bool:
    """Check the module-level settings and report any problems found.

    Checks performed:
      * the selected LLM backend has its credential set,
      * the output directory exists or can be created,
      * the chunk size and per-request token budget are not too small.

    Side effects:
        Creates OUTPUT_DIR when it is missing, and prints each problem to
        stdout under a "Configuration Issues:" heading.

    Returns:
        True when no problems were found, False otherwise.
    """
    issues = []

    # The selected backend must have its matching credential.
    if LLM_BACKEND == "hf_api" and not HUGGINGFACE_TOKEN:
        issues.append("HF API selected but HUGGINGFACE_TOKEN not set")

    if LLM_BACKEND == "openai" and not OPENAI_API_KEY:
        issues.append("OpenAI selected but OPENAI_API_KEY not set")

    # Ensure the output directory is usable. exist_ok=True removes the
    # check-then-create race, and catching OSError (instead of a bare except)
    # lets KeyboardInterrupt/SystemExit propagate. It also reports a path
    # that exists but is not a directory, which was previously accepted.
    try:
        os.makedirs(OUTPUT_DIR, exist_ok=True)
    except OSError:
        issues.append(f"Cannot create output directory: {OUTPUT_DIR}")

    # Sanity limits on the token budgets.
    if MAX_CHUNK_TOKENS < 500:
        issues.append("MAX_CHUNK_TOKENS too small (< 500)")

    if MAX_TOKENS_PER_REQUEST < 100:
        issues.append("MAX_TOKENS_PER_REQUEST too small (< 100)")

    if issues:
        print("Configuration Issues:")
        for issue in issues:
            print(f"  - {issue}")
        return False

    return True


# ============================================================================
# INITIALIZATION
# ============================================================================

if __name__ == "__main__":
    # Run as a script: dump the settings first, then report whether they
    # pass validation.
    print_config()
    verdict = (
        "\n✓ Configuration valid"
        if validate_config()
        else "\n✗ Configuration has issues"
    )
    print(verdict)