jmisak committed on
Commit
a233900
·
verified ·
1 Parent(s): faacab1

Update config.py

Browse files
Files changed (1) hide show
  1. config.py +282 -282
config.py CHANGED
@@ -1,283 +1,283 @@
1
- import os
2
- from typing import Dict, Any
3
-
4
- # ============================================================================
5
- # LLM CONFIGURATION
6
- # ============================================================================
7
-
8
- # Choose LLM backend: "hf_api" (recommended), "local", or "openai"
9
- LLM_BACKEND = os.getenv("LLM_BACKEND", "hf_api")
10
-
11
- # Hugging Face Configuration
12
- HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN", "")
13
- HF_MODEL = os.getenv("HF_MODEL", "mistralai/Mixtral-8x7B-Instruct-v0.1")
14
-
15
- # Local Model Configuration
16
- LOCAL_MODEL = os.getenv("LOCAL_MODEL", "google/flan-t5-xl")
17
- DEVICE = os.getenv("DEVICE", "auto") # "auto", "cpu", "cuda", "mps"
18
-
19
- # OpenAI Configuration (if using OpenAI)
20
- OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
21
- OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-4")
22
-
23
- # LLM Parameters
24
- MAX_TOKENS_PER_REQUEST = int(os.getenv("MAX_TOKENS_PER_REQUEST", "300"))
25
- LLM_TEMPERATURE = float(os.getenv("LLM_TEMPERATURE", "0.3"))
26
- LLM_TIMEOUT = int(os.getenv("LLM_TIMEOUT", "120"))
27
-
28
- # ============================================================================
29
- # CHUNKING CONFIGURATION
30
- # ============================================================================
31
-
32
- MAX_CHUNK_TOKENS = int(os.getenv("MAX_CHUNK_TOKENS", "6000"))
33
- OVERLAP_TOKENS = int(os.getenv("OVERLAP_TOKENS", "150"))
34
- TOKENIZER_ENCODING = os.getenv("TOKENIZER_ENCODING", "cl100k_base")
35
-
36
- # ============================================================================
37
- # QUALITY THRESHOLDS
38
- # ============================================================================
39
-
40
- MIN_QUALITY_SCORE = float(os.getenv("MIN_QUALITY_SCORE", "0.3"))
41
- MIN_WORD_COUNT = int(os.getenv("MIN_WORD_COUNT", "50"))
42
- MIN_TEXT_LENGTH = int(os.getenv("MIN_TEXT_LENGTH", "100"))
43
-
44
- # Quality grade thresholds
45
- QUALITY_EXCELLENT = 0.8
46
- QUALITY_GOOD = 0.6
47
- QUALITY_FAIR = 0.4
48
-
49
- # ============================================================================
50
- # FILE PROCESSING CONFIGURATION
51
- # ============================================================================
52
-
53
- MAX_FILE_SIZE_MB = int(os.getenv("MAX_FILE_SIZE_MB", "50"))
54
- SUPPORTED_FORMATS = [".docx", ".pdf"]
55
- MAX_FILES_PER_BATCH = int(os.getenv("MAX_FILES_PER_BATCH", "10"))
56
-
57
- # ============================================================================
58
- # OUTPUT CONFIGURATION
59
- # ============================================================================
60
-
61
- OUTPUT_DIR = os.getenv("OUTPUT_DIR", "./outputs")
62
- CSV_FILENAME = "transcript_analysis.csv"
63
- PDF_FILENAME = "transcript_report.pdf"
64
-
65
- # Ensure output directory exists
66
- os.makedirs(OUTPUT_DIR, exist_ok=True)
67
-
68
- # ============================================================================
69
- # DEBUG AND LOGGING
70
- # ============================================================================
71
-
72
- DEBUG_MODE = os.getenv("DEBUG_MODE", "False").lower() == "true"
73
- VERBOSE_LOGGING = os.getenv("VERBOSE_LOGGING", "False").lower() == "true"
74
- LOG_FILE = os.getenv("LOG_FILE", "transcript_analysis.log")
75
-
76
- # ============================================================================
77
- # ADVANCED SETTINGS
78
- # ============================================================================
79
-
80
- # Cache extracted text to avoid re-processing
81
- ENABLE_CACHING = os.getenv("ENABLE_CACHING", "True").lower() == "true"
82
- CACHE_DIR = os.getenv("CACHE_DIR", "./.cache")
83
-
84
- # Parallel processing
85
- ENABLE_PARALLEL_PROCESSING = os.getenv("ENABLE_PARALLEL_PROCESSING", "False").lower() == "true"
86
- MAX_WORKERS = int(os.getenv("MAX_WORKERS", "4"))
87
-
88
- # ============================================================================
89
- # SYSTEM PROMPTS
90
- # ============================================================================
91
-
92
- BASE_SYSTEM_PROMPT = """You are an expert medical transcript analyzer specializing in healthcare interviews.
93
-
94
- Your task is to extract structured, actionable insights from interview transcripts.
95
-
96
- Core Principles:
97
- - Focus on factual, verifiable medical information
98
- - Distinguish between speaker roles accurately
99
- - Filter out pleasantries, disclaimers, and off-topic content
100
- - Extract specific medical terms, dosages, and treatment details
101
- - Identify patterns and clinical reasoning
102
- - Maintain objectivity and clinical accuracy
103
- """
104
-
105
- HCP_SYSTEM_PROMPT = BASE_SYSTEM_PROMPT + """
106
- Healthcare Professional Analysis Focus:
107
- - Prescribing patterns and medication choices
108
- - Diagnostic reasoning and clinical decision-making
109
- - Treatment protocols and guidelines referenced
110
- - Peer perspectives on efficacy and safety
111
- - Barriers to treatment or adoption
112
- - Off-label uses or emerging practices
113
-
114
- Extract and structure:
115
- 1. Diagnoses mentioned with context
116
- 2. Prescriptions with dosage, frequency, and rationale
117
- 3. Treatment strategies and their justifications
118
- 4. Clinical guidelines or studies referenced
119
- 5. Challenges or barriers discussed
120
- 6. Key clinical insights or pearls
121
- """
122
-
123
- PATIENT_SYSTEM_PROMPT = BASE_SYSTEM_PROMPT + """
124
- Patient Interview Analysis Focus:
125
- - Symptom descriptions and severity
126
- - Treatment experiences and outcomes
127
- - Side effects and tolerability
128
- - Quality of life impacts
129
- - Adherence challenges and enablers
130
- - Emotional and psychological factors
131
- - Healthcare system interactions
132
-
133
- Extract and structure:
134
- 1. Primary symptoms with duration and severity
135
- 2. Current and past treatments
136
- 3. Treatment effectiveness and satisfaction
137
- 4. Side effects experienced
138
- 5. Concerns and unmet needs
139
- 6. Quality of life impacts
140
- 7. Support systems and resources
141
- """
142
-
143
- SUMMARY_SYSTEM_PROMPT = """You are analyzing multiple transcripts to identify cross-cutting trends.
144
-
145
- Focus on:
146
- - Frequency analysis (how many interviewees mentioned X?)
147
- - Common patterns and themes
148
- - Consensus and disagreements
149
- - Statistical insights (percentages, distributions)
150
- - Actionable recommendations for stakeholders
151
-
152
- Provide:
153
- 1. Quantitative summary (X% mentioned Y)
154
- 2. Key trends and patterns
155
- 3. Notable outliers or unique insights
156
- 4. Actionable recommendations
157
- 5. Data gaps or areas needing follow-up
158
- """
159
-
160
- # ============================================================================
161
- # VALIDATION SETTINGS
162
- # ============================================================================
163
-
164
- VALIDATION_CONFIG = {
165
- "min_word_ratio": 0.3,
166
- "max_repetition_ratio": 1.5,
167
- "min_sentences": 3,
168
- "check_errors": True,
169
- "check_gibberish": True
170
- }
171
-
172
- # ============================================================================
173
- # DASHBOARD SETTINGS
174
- # ============================================================================
175
-
176
- DASHBOARD_CONFIG = {
177
- "figure_size": (14, 10),
178
- "dpi": 100,
179
- "style": "default",
180
- "top_n_items": 8,
181
- "color_scheme": {
182
- "primary": "#3498db",
183
- "secondary": "#2ecc71",
184
- "accent": "#e74c3c",
185
- "warning": "#f39c12"
186
- }
187
- }
188
-
189
- # ============================================================================
190
- # HELPER FUNCTIONS
191
- # ============================================================================
192
-
193
- def get_config() -> Dict[str, Any]:
194
- """Return all configuration as a dictionary"""
195
- return {
196
- "llm": {
197
- "backend": LLM_BACKEND,
198
- "model": HF_MODEL if LLM_BACKEND == "hf_api" else LOCAL_MODEL,
199
- "max_tokens": MAX_TOKENS_PER_REQUEST,
200
- "temperature": LLM_TEMPERATURE,
201
- "timeout": LLM_TIMEOUT
202
- },
203
- "chunking": {
204
- "max_tokens": MAX_CHUNK_TOKENS,
205
- "overlap": OVERLAP_TOKENS
206
- },
207
- "quality": {
208
- "min_score": MIN_QUALITY_SCORE,
209
- "min_words": MIN_WORD_COUNT
210
- },
211
- "files": {
212
- "max_size_mb": MAX_FILE_SIZE_MB,
213
- "max_per_batch": MAX_FILES_PER_BATCH,
214
- "supported": SUPPORTED_FORMATS
215
- },
216
- "output": {
217
- "directory": OUTPUT_DIR,
218
- "csv": CSV_FILENAME,
219
- "pdf": PDF_FILENAME
220
- },
221
- "debug": DEBUG_MODE,
222
- "caching": ENABLE_CACHING,
223
- "parallel": ENABLE_PARALLEL_PROCESSING
224
- }
225
-
226
-
227
- def print_config():
228
- """Print current configuration"""
229
- config = get_config()
230
- print("=" * 60)
231
- print("TRANSCRIPTORAI CONFIGURATION")
232
- print("=" * 60)
233
- for section, settings in config.items():
234
- print(f"\n{section.upper()}:")
235
- for key, value in settings.items():
236
- print(f" {key}: {value}")
237
- print("=" * 60)
238
-
239
-
240
- def validate_config() -> bool:
241
- """Validate configuration settings"""
242
- issues = []
243
-
244
- # Check LLM configuration
245
- if LLM_BACKEND == "hf_api" and not HUGGINGFACE_TOKEN:
246
- issues.append("HF API selected but HUGGINGFACE_TOKEN not set")
247
-
248
- if LLM_BACKEND == "openai" and not OPENAI_API_KEY:
249
- issues.append("OpenAI selected but OPENAI_API_KEY not set")
250
-
251
- # Check paths exist
252
- if not os.path.exists(OUTPUT_DIR):
253
- try:
254
- os.makedirs(OUTPUT_DIR)
255
- except:
256
- issues.append(f"Cannot create output directory: {OUTPUT_DIR}")
257
-
258
- # Check reasonable values
259
- if MAX_CHUNK_TOKENS < 500:
260
- issues.append("MAX_CHUNK_TOKENS too small (< 500)")
261
-
262
- if MAX_TOKENS_PER_REQUEST < 100:
263
- issues.append("MAX_TOKENS_PER_REQUEST too small (< 100)")
264
-
265
- if issues:
266
- print("Configuration Issues:")
267
- for issue in issues:
268
- print(f" - {issue}")
269
- return False
270
-
271
- return True
272
-
273
-
274
- # ============================================================================
275
- # INITIALIZATION
276
- # ============================================================================
277
-
278
- if __name__ == "__main__":
279
- print_config()
280
- if validate_config():
281
- print("\n✓ Configuration valid")
282
- else:
283
  print("\n✗ Configuration has issues")
 
1
"""Central configuration for the transcript analyzer, tuned for HF Spaces.

All tunables default to Spaces-friendly values but remain overridable via
environment variables, so other deployments don't need code edits.
"""
import os
from typing import Dict, Any

# ============================================================================
# LLM CONFIGURATION
# ============================================================================

# Choose LLM backend: "hf_api" (recommended), "local", or "openai".
# "hf_api" is the default for HF Spaces; still env-overridable.
LLM_BACKEND = os.getenv("LLM_BACKEND", "hf_api")

# Hugging Face Configuration
HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN", "")
# Lighter default model for Spaces; override via HF_MODEL if needed.
HF_MODEL = os.getenv("HF_MODEL", "mistralai/Mistral-7B-Instruct-v0.2")

# Local Model Configuration
LOCAL_MODEL = os.getenv("LOCAL_MODEL", "google/flan-t5-xl")
DEVICE = os.getenv("DEVICE", "auto")  # "auto", "cpu", "cuda", "mps"

# OpenAI Configuration (if using OpenAI)
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-4")

# LLM Parameters
# Defaults lowered for Spaces (short responses, 25s proxy timeout), but kept
# env-overridable instead of hard-coded so other environments can raise them.
MAX_TOKENS_PER_REQUEST = int(os.getenv("MAX_TOKENS_PER_REQUEST", "100"))
LLM_TEMPERATURE = float(os.getenv("LLM_TEMPERATURE", "0.3"))
LLM_TIMEOUT = int(os.getenv("LLM_TIMEOUT", "25"))

# ============================================================================
# CHUNKING CONFIGURATION
# ============================================================================

# Smaller chunks by default for Spaces; env-overridable.
MAX_CHUNK_TOKENS = int(os.getenv("MAX_CHUNK_TOKENS", "2000"))
OVERLAP_TOKENS = int(os.getenv("OVERLAP_TOKENS", "150"))
TOKENIZER_ENCODING = os.getenv("TOKENIZER_ENCODING", "cl100k_base")

# ============================================================================
# QUALITY THRESHOLDS
# ============================================================================

MIN_QUALITY_SCORE = float(os.getenv("MIN_QUALITY_SCORE", "0.3"))
MIN_WORD_COUNT = int(os.getenv("MIN_WORD_COUNT", "50"))
MIN_TEXT_LENGTH = int(os.getenv("MIN_TEXT_LENGTH", "100"))

# Quality grade thresholds (score >= threshold maps to that grade;
# presumably consumed by the scoring module — confirm there).
QUALITY_EXCELLENT = 0.8
QUALITY_GOOD = 0.6
QUALITY_FAIR = 0.4

# ============================================================================
# FILE PROCESSING CONFIGURATION
# ============================================================================

MAX_FILE_SIZE_MB = int(os.getenv("MAX_FILE_SIZE_MB", "50"))
SUPPORTED_FORMATS = [".docx", ".pdf"]
MAX_FILES_PER_BATCH = int(os.getenv("MAX_FILES_PER_BATCH", "10"))

# ============================================================================
# OUTPUT CONFIGURATION
# ============================================================================

OUTPUT_DIR = os.getenv("OUTPUT_DIR", "./outputs")
CSV_FILENAME = "transcript_analysis.csv"
PDF_FILENAME = "transcript_report.pdf"

# Ensure output directory exists. Import-time side effect kept for backward
# compatibility: other modules may rely on the directory existing.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ============================================================================
# DEBUG AND LOGGING
# ============================================================================

# String-to-bool: only the literal (case-insensitive) "true" enables a flag.
DEBUG_MODE = os.getenv("DEBUG_MODE", "False").lower() == "true"
VERBOSE_LOGGING = os.getenv("VERBOSE_LOGGING", "False").lower() == "true"
LOG_FILE = os.getenv("LOG_FILE", "transcript_analysis.log")

# ============================================================================
# ADVANCED SETTINGS
# ============================================================================

# Cache extracted text to avoid re-processing
ENABLE_CACHING = os.getenv("ENABLE_CACHING", "True").lower() == "true"
CACHE_DIR = os.getenv("CACHE_DIR", "./.cache")

# Parallel processing
ENABLE_PARALLEL_PROCESSING = os.getenv("ENABLE_PARALLEL_PROCESSING", "False").lower() == "true"
MAX_WORKERS = int(os.getenv("MAX_WORKERS", "4"))
88
# ============================================================================
# SYSTEM PROMPTS
# ============================================================================

# Shared preamble for every analysis prompt. The role-specific prompts below
# concatenate their own focus areas onto this base.
BASE_SYSTEM_PROMPT = """You are an expert medical transcript analyzer specializing in healthcare interviews.

Your task is to extract structured, actionable insights from interview transcripts.

Core Principles:
- Focus on factual, verifiable medical information
- Distinguish between speaker roles accurately
- Filter out pleasantries, disclaimers, and off-topic content
- Extract specific medical terms, dosages, and treatment details
- Identify patterns and clinical reasoning
- Maintain objectivity and clinical accuracy
"""

# Prompt used when the interviewee is a healthcare professional (HCP).
HCP_SYSTEM_PROMPT = BASE_SYSTEM_PROMPT + """
Healthcare Professional Analysis Focus:
- Prescribing patterns and medication choices
- Diagnostic reasoning and clinical decision-making
- Treatment protocols and guidelines referenced
- Peer perspectives on efficacy and safety
- Barriers to treatment or adoption
- Off-label uses or emerging practices

Extract and structure:
1. Diagnoses mentioned with context
2. Prescriptions with dosage, frequency, and rationale
3. Treatment strategies and their justifications
4. Clinical guidelines or studies referenced
5. Challenges or barriers discussed
6. Key clinical insights or pearls
"""

# Prompt used when the interviewee is a patient.
PATIENT_SYSTEM_PROMPT = BASE_SYSTEM_PROMPT + """
Patient Interview Analysis Focus:
- Symptom descriptions and severity
- Treatment experiences and outcomes
- Side effects and tolerability
- Quality of life impacts
- Adherence challenges and enablers
- Emotional and psychological factors
- Healthcare system interactions

Extract and structure:
1. Primary symptoms with duration and severity
2. Current and past treatments
3. Treatment effectiveness and satisfaction
4. Side effects experienced
5. Concerns and unmet needs
6. Quality of life impacts
7. Support systems and resources
"""

# Prompt for cross-transcript trend analysis (does NOT include the base
# preamble — it is a standalone prompt).
SUMMARY_SYSTEM_PROMPT = """You are analyzing multiple transcripts to identify cross-cutting trends.

Focus on:
- Frequency analysis (how many interviewees mentioned X?)
- Common patterns and themes
- Consensus and disagreements
- Statistical insights (percentages, distributions)
- Actionable recommendations for stakeholders

Provide:
1. Quantitative summary (X% mentioned Y)
2. Key trends and patterns
3. Notable outliers or unique insights
4. Actionable recommendations
5. Data gaps or areas needing follow-up
"""
159
+
160
# ============================================================================
# VALIDATION SETTINGS
# ============================================================================

# Thresholds for sanity-checking text quality; exact semantics live in the
# consumer module (not visible here) — verify there before tuning.
VALIDATION_CONFIG = {
    "min_word_ratio": 0.3,
    "max_repetition_ratio": 1.5,
    "min_sentences": 3,
    "check_errors": True,
    "check_gibberish": True
}

# ============================================================================
# DASHBOARD SETTINGS
# ============================================================================

# Plot sizing and palette for the results dashboard; presumably consumed by
# a matplotlib-based renderer — confirm against the dashboard module.
DASHBOARD_CONFIG = {
    "figure_size": (14, 10),
    "dpi": 100,
    "style": "default",
    "top_n_items": 8,
    "color_scheme": {
        "primary": "#3498db",
        "secondary": "#2ecc71",
        "accent": "#e74c3c",
        "warning": "#f39c12"
    }
}
188
+
189
# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

def get_config() -> Dict[str, Any]:
    """Assemble the module's configuration constants into one dictionary.

    Nested sections ("llm", "chunking", "quality", "files", "output") are
    dicts; "debug", "caching", and "parallel" are top-level scalar flags.
    """
    llm_section = {
        "backend": LLM_BACKEND,
        # The effective model depends on which backend is selected.
        "model": HF_MODEL if LLM_BACKEND == "hf_api" else LOCAL_MODEL,
        "max_tokens": MAX_TOKENS_PER_REQUEST,
        "temperature": LLM_TEMPERATURE,
        "timeout": LLM_TIMEOUT,
    }
    chunking_section = {
        "max_tokens": MAX_CHUNK_TOKENS,
        "overlap": OVERLAP_TOKENS,
    }
    quality_section = {
        "min_score": MIN_QUALITY_SCORE,
        "min_words": MIN_WORD_COUNT,
    }
    files_section = {
        "max_size_mb": MAX_FILE_SIZE_MB,
        "max_per_batch": MAX_FILES_PER_BATCH,
        "supported": SUPPORTED_FORMATS,
    }
    output_section = {
        "directory": OUTPUT_DIR,
        "csv": CSV_FILENAME,
        "pdf": PDF_FILENAME,
    }
    return {
        "llm": llm_section,
        "chunking": chunking_section,
        "quality": quality_section,
        "files": files_section,
        "output": output_section,
        "debug": DEBUG_MODE,
        "caching": ENABLE_CACHING,
        "parallel": ENABLE_PARALLEL_PROCESSING,
    }
225
+
226
+
227
def print_config():
    """Print the current configuration to stdout, section by section.

    Fix: get_config() returns dicts for most sections but plain booleans for
    "debug", "caching", and "parallel"; the previous version unconditionally
    called .items() on every value and raised AttributeError on those flags.
    """
    config = get_config()
    print("=" * 60)
    print("TRANSCRIPTORAI CONFIGURATION")
    print("=" * 60)
    for section, settings in config.items():
        print(f"\n{section.upper()}:")
        if isinstance(settings, dict):
            for key, value in settings.items():
                print(f" {key}: {value}")
        else:
            # Scalar top-level flag (debug / caching / parallel).
            print(f" {settings}")
    print("=" * 60)
238
+
239
+
240
def validate_config() -> bool:
    """Validate configuration settings.

    Prints every problem found and returns True when the configuration is
    usable, False otherwise. Never raises for expected failure modes.
    """
    issues = []

    # Each backend needs its credential present.
    if LLM_BACKEND == "hf_api" and not HUGGINGFACE_TOKEN:
        issues.append("HF API selected but HUGGINGFACE_TOKEN not set")

    if LLM_BACKEND == "openai" and not OPENAI_API_KEY:
        issues.append("OpenAI selected but OPENAI_API_KEY not set")

    # Re-check the output directory (module import already attempts creation,
    # but this reports a usable diagnostic if that failed).
    if not os.path.exists(OUTPUT_DIR):
        try:
            os.makedirs(OUTPUT_DIR, exist_ok=True)
        except OSError:
            # Narrowed from a bare `except:`, which also swallowed
            # KeyboardInterrupt and SystemExit.
            issues.append(f"Cannot create output directory: {OUTPUT_DIR}")

    # Sanity-check numeric limits.
    if MAX_CHUNK_TOKENS < 500:
        issues.append("MAX_CHUNK_TOKENS too small (< 500)")

    if MAX_TOKENS_PER_REQUEST < 100:
        issues.append("MAX_TOKENS_PER_REQUEST too small (< 100)")

    if issues:
        print("Configuration Issues:")
        for issue in issues:
            print(f" - {issue}")
        return False

    return True
272
+
273
+
274
# ============================================================================
# INITIALIZATION
# ============================================================================

if __name__ == "__main__":
    # Dump the effective settings, then report whether they are usable.
    print_config()
    ok = validate_config()
    print("\n✓ Configuration valid" if ok else "\n✗ Configuration has issues")