dipsikha25 committed on
Commit
7e8d37e
·
verified ·
1 Parent(s): 8688135

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +57 -1034
app.py CHANGED
@@ -1,1089 +1,112 @@
1
- # app.py
2
- # Pharma KPI Copilot
3
- # - Auto-loads KPI Glossary Excel from same folder as app.py
4
- # - Reads PDF for KPI definition / formula / notes
5
- # - Fixes Excel mapping so report names show instead of "Not mapped"
6
- # - Displays report / offering values as colored badges
7
- # - Installs openpyxl automatically if missing
8
-
9
  import os
10
  import re
11
- import sys
12
- import subprocess
13
- import importlib.util
14
- import unicodedata
15
  from pathlib import Path
16
- from difflib import SequenceMatcher
17
-
18
-
19
- def ensure_package(package_name: str):
20
- if importlib.util.find_spec(package_name) is None:
21
- print(f"Package '{package_name}' not found. Installing...")
22
- subprocess.check_call([sys.executable, '-m', 'pip', 'install', package_name])
23
- print(f"Package '{package_name}' installed successfully.")
24
-
25
-
26
- # Required for pandas Excel engine
27
- ensure_package('openpyxl')
28
 
29
  import gradio as gr
30
  import pandas as pd
31
  from langchain_community.document_loaders import PyPDFLoader
32
  from langchain_text_splitters import RecursiveCharacterTextSplitter
33
 
34
- os.environ['TOKENIZERS_PARALLELISM'] = 'false'
35
-
36
- SERVICENOW_INCIDENT_URL = os.getenv(
37
- 'SERVICENOW_INCIDENT_URL',
38
- 'https://sanofiservices.service-now.com/onesupport?id=sc_cat_item&sys_id=a5c743d39761b19cbb28fa871153afc3',
39
- )
40
- PDF_FILE = 'data.pdf'
41
- DEFAULT_KPI_EXCEL = 'CIA Consolidated KPIs_MetricsGovernance (1).xlsx'
42
-
43
- REPORT_FLAG_COLUMNS = [
44
- 'SFE', 'B360', 'OMNICHANNEL', 'C360', 'E&C', 'AC',
45
- 'Field Reporting', 'Content Reporting', 'Above Country', 'Country'
46
- ]
47
-
48
- EXTRA_INFO_COLUMNS = [
49
- 'Placement in Offering', 'Calculated at:', 'Domain', 'Interaction', 'Channels', 'PowerBI Field/Measure'
50
- ]
51
-
52
- MANUAL_ALIAS_MAP = {
53
- # 'hcp reach in occp': 'HCPs in OCCP',
54
- }
55
 
 
 
56
 
57
  # =========================================================
58
- # 1) TEXT HELPERS
59
  # =========================================================
 
60
  def fix_pdf_text(text: str) -> str:
61
  if not text:
62
- return ''
63
- text = unicodedata.normalize('NFKC', text)
64
- replacements = {
65
- 'fi': 'fi', 'fl': 'fl', '“': '"', '”': '"', '’': "'", '‘': "'", '–': '-', '—': '-', '\u00ad': '',
66
- }
67
- for bad, good in replacements.items():
68
- text = text.replace(bad, good)
69
- text = re.sub(r'(?<=\w)[θΘϑϴƟɵ](?=\w)', 'ti', text)
70
  return text
71
 
72
-
73
  def normalize_exact(text: str) -> str:
74
- text = fix_pdf_text(text or '').lower().strip()
75
- return re.sub(r'\s+', ' ', text)
76
-
77
-
78
- def singularize_token(token: str) -> str:
79
- token = token.strip().lower()
80
- if len(token) > 4 and token.endswith('ies'):
81
- return token[:-3] + 'y'
82
- if len(token) > 3 and token.endswith('s') and not token.endswith('ss'):
83
- return token[:-1]
84
- return token
85
-
86
-
87
- def normalize_loose(text: str) -> str:
88
- text = fix_pdf_text(text or '').lower().strip()
89
- text = text.replace('#', ' ').replace('%', ' ')
90
- text = re.sub(r'[^a-z0-9]+', ' ', text)
91
- text = re.sub(r'\s+', ' ', text).strip()
92
- if not text:
93
- return ''
94
- return ' '.join(singularize_token(tok) for tok in text.split())
95
-
96
-
97
- def tokenize_loose(text: str):
98
- loose = normalize_loose(text)
99
- return loose.split() if loose else []
100
-
101
-
102
- STOPWORDS = {
103
- 'a', 'an', 'the', 'in', 'of', 'with', 'and', 'or', 'for', 'to', 'by', 'on',
104
- 'this', 'that', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
105
- 'what', 'how', 'why', 'show', 'give', 'tell', 'me', 'please', 'explain',
106
- 'search', 'find', 'calculated', 'computed', 'measured', 'formula', 'mean', 'important',
107
- }
108
-
109
-
110
- def significant_tokens(text: str):
111
- toks = tokenize_loose(text)
112
- sig = [t for t in toks if t not in STOPWORDS]
113
- return sig if sig else toks
114
-
115
 
116
  def clean_user_query(text: str) -> str:
117
- text = fix_pdf_text(text or '').strip()
118
- text = re.sub(r'[?]+$', '', text).strip()
119
- patterns = [
120
- r'^what is\s+', r'^what s\s+', r'^show me\s+', r'^give me\s+', r'^tell me\s+',
121
- r'^explain\s+', r'^find\s+', r'^search\s+for\s+', r'^how is\s+', r'^why is\s+',
122
- ]
123
- lowered = text.lower()
124
- for pat in patterns:
125
- lowered = re.sub(pat, '', lowered).strip()
126
- return lowered.strip()
127
-
128
-
129
- def clean_formula_text(text: str) -> str:
130
- text = fix_pdf_text(text or '').lower()
131
- text = re.sub(r'--.*', '', text)
132
- text = re.sub(r'\s+', '', text)
133
- return text
134
-
135
-
136
- def html_escape(text: str) -> str:
137
- if text is None:
138
- return ''
139
- return (
140
- str(text)
141
- .replace('&', '&amp;')
142
- .replace('<', '&lt;')
143
- .replace('>', '&gt;')
144
- .replace('"', '&quot;')
145
- )
146
-
147
-
148
- def nl2br(text: str) -> str:
149
- return html_escape(fix_pdf_text(text)).replace('\n', '<br>')
150
-
151
-
152
- def is_generic_followup_question(text: str) -> bool:
153
- q = normalize_exact(text)
154
- generic_patterns = [
155
- r'^how is this calculated', r'^how is this computed', r'^how is this measured',
156
- r'^what is the formula', r'^show formula', r'^show the formula', r'^give formula',
157
- r'^why is this important', r'^explain this', r'^what does this mean',
158
- ]
159
- return any(re.search(p, q) for p in generic_patterns)
160
-
161
-
162
- def extract_kpi_name_from_notes(notes_text: str) -> str:
163
- if not notes_text:
164
- return ''
165
- m = re.search(r'\*\*KPI Name:\*\*\s*(.+)', notes_text)
166
- return m.group(1).strip() if m else ''
167
-
168
-
169
- def resolve_alias(user_query: str):
170
- cleaned = clean_user_query(user_query)
171
- q = normalize_loose(cleaned)
172
- if not q:
173
- return user_query, None, None
174
- alias_map_norm = {normalize_loose(k): v for k, v in MANUAL_ALIAS_MAP.items()}
175
- if q in alias_map_norm:
176
- return alias_map_norm[q], q, alias_map_norm[q]
177
- return cleaned, None, None
178
-
179
 
180
  # =========================================================
181
- # 2) EXCEL LOADING AND MAPPING
182
  # =========================================================
183
- def is_truthy_excel_value(value):
184
- if pd.isna(value):
185
- return False
186
- return str(value).strip().lower() in {'yes', 'y', 'true', '1', 'x'}
187
-
188
-
189
- def detect_glossary_header_row(raw_df: pd.DataFrame):
190
- """Find the real KPI Glossary header row."""
191
- for idx in range(min(len(raw_df), 60)):
192
- row_values = [normalize_exact(str(v)).replace('/', ' ') for v in raw_df.iloc[idx].tolist()]
193
- if 'metrics kpis' in row_values and 'powerbi field measure' in row_values:
194
- return idx
195
- joined = ' | '.join(row_values)
196
- if 'metrics kpis' in joined and ('powerbi field measure' in joined or 'definitions' in joined):
197
- return idx
198
- return None
199
-
200
-
201
- def build_glossary_dataframe(excel_path: str):
202
- raw = pd.read_excel(excel_path, sheet_name='KPI Glossary', header=None, engine='openpyxl')
203
- header_row = detect_glossary_header_row(raw)
204
- if header_row is None:
205
- return None, None
206
-
207
- header = [str(x).strip() for x in raw.iloc[header_row].tolist()]
208
- data = raw.iloc[header_row + 1:].copy().reset_index(drop=True)
209
- data.columns = header
210
- data = data.dropna(how='all')
211
- keep_cols = [str(c).strip() != '' and str(c).strip().lower() != 'nan' for c in data.columns]
212
- data = data.loc[:, keep_cols]
213
- data.columns = [str(c).strip() for c in data.columns]
214
- return data, header_row
215
-
216
-
217
- def merge_excel_record(a: dict, b: dict):
218
- if not a:
219
- return b
220
- if not b:
221
- return a
222
- merged = {
223
- 'kpi_name': a.get('kpi_name') or b.get('kpi_name', ''),
224
- 'measure_name': a.get('measure_name') or b.get('measure_name', ''),
225
- 'report_sources': sorted(set(a.get('report_sources', [])) | set(b.get('report_sources', []))),
226
- 'extra_info': {},
227
- 'row_ids': sorted(set(a.get('row_ids', [])) | set(b.get('row_ids', []))),
228
- }
229
- for col in EXTRA_INFO_COLUMNS:
230
- vals = []
231
- for rec in (a, b):
232
- val = rec.get('extra_info', {}).get(col)
233
- if val and val not in vals:
234
- vals.append(val)
235
- if vals:
236
- merged['extra_info'][col] = ' | '.join(vals)
237
- return merged
238
-
239
-
240
- def add_record_to_mapping(mapping: dict, key: str, record: dict):
241
- if not key:
242
- return
243
- mapping[key] = merge_excel_record(mapping.get(key), record) if key in mapping else record
244
-
245
-
246
- def load_kpi_excel_mapping(excel_path: str):
247
- if not excel_path or not Path(excel_path).exists():
248
- print(f'Excel not found: {excel_path}')
249
- return {}
250
 
 
251
  try:
252
- df, header_row = build_glossary_dataframe(excel_path)
 
 
253
  except Exception as e:
254
- print(f'Could not read KPI Glossary sheet: {e}')
255
- return {}
256
-
257
- if df is None or df.empty:
258
- print('Could not detect KPI Glossary header row or data is empty.')
259
- return {}
260
-
261
- print(f'KPI Glossary header row detected at: {header_row}')
262
- print(f'KPI Glossary columns detected: {list(df.columns)[:20]}')
263
-
264
- kpi_col = 'Metrics/KPIs' if 'Metrics/KPIs' in df.columns else None
265
- measure_col = 'PowerBI Field/Measure' if 'PowerBI Field/Measure' in df.columns else None
266
- if not kpi_col and not measure_col:
267
- print('Metrics/KPIs and PowerBI Field/Measure columns not found.')
268
- return {}
269
-
270
- mapping = {}
271
- for idx, row in df.iterrows():
272
- kpi_name = str(row.get(kpi_col, '')).strip() if kpi_col else ''
273
- measure_name = str(row.get(measure_col, '')).strip() if measure_col else ''
274
- if not kpi_name and not measure_name:
275
- continue
276
-
277
- report_sources = [col for col in REPORT_FLAG_COLUMNS if col in df.columns and is_truthy_excel_value(row.get(col))]
278
-
279
- extra_info = {}
280
- for col in EXTRA_INFO_COLUMNS:
281
- if col in df.columns:
282
- val = row.get(col)
283
- if pd.notna(val) and str(val).strip():
284
- extra_info[col] = str(val).strip()
285
-
286
- record = {
287
- 'kpi_name': kpi_name,
288
- 'measure_name': measure_name,
289
- 'report_sources': sorted(set(report_sources)),
290
- 'extra_info': extra_info,
291
- 'row_ids': [int(idx)],
292
- }
293
-
294
- if kpi_name:
295
- add_record_to_mapping(mapping, normalize_loose(kpi_name), record)
296
- if measure_name:
297
- add_record_to_mapping(mapping, normalize_loose(measure_name), record)
298
-
299
- print(f'Final mapped KPI keys: {len(mapping)}')
300
- return mapping
301
-
302
-
303
- def excel_candidate_keys(*texts):
304
- keys = []
305
- for t in texts:
306
- if not t:
307
- continue
308
- k = normalize_loose(t)
309
- if k and k not in keys:
310
- keys.append(k)
311
- return keys
312
-
313
-
314
- def excel_token_coverage_score(query_key: str, candidate_key: str):
315
- q_tokens = significant_tokens(query_key)
316
- c_tokens = significant_tokens(candidate_key)
317
- if not q_tokens or not c_tokens:
318
- return 0.0, 0
319
- q_set, c_set = set(q_tokens), set(c_tokens)
320
- overlap = q_set & c_set
321
- return len(overlap) / max(len(q_set), 1), len(overlap)
322
-
323
-
324
- def lookup_kpi_excel_info(kpi_name: str, measure_name: str, excel_mapping: dict, query_text: str = None):
325
- if not excel_mapping:
326
- return None
327
- keys = excel_candidate_keys(query_text, kpi_name, measure_name)
328
- result = None
329
-
330
- # exact lookup
331
- for key in keys:
332
- if key in excel_mapping:
333
- result = merge_excel_record(result, excel_mapping[key]) if result else excel_mapping[key]
334
- if result:
335
- return result
336
-
337
- # fuzzy fallback
338
- best_key = None
339
- best_ratio = 0.0
340
- for q in keys:
341
- for cand in excel_mapping.keys():
342
- coverage, overlap = excel_token_coverage_score(q, cand)
343
- ratio = SequenceMatcher(None, q, cand).ratio()
344
- if coverage >= 1.0 or ratio >= 0.84 or (overlap >= 2 and ratio >= 0.70):
345
- if ratio > best_ratio:
346
- best_ratio = ratio
347
- best_key = cand
348
- return excel_mapping.get(best_key) if best_key else None
349
-
350
-
351
- def load_default_excel_if_present():
352
- return load_kpi_excel_mapping(DEFAULT_KPI_EXCEL) if Path(DEFAULT_KPI_EXCEL).exists() else {}
353
-
354
 
355
  # =========================================================
356
- # 3) PDF LOAD / PARSE
357
  # =========================================================
358
- loader = PyPDFLoader(PDF_FILE)
359
- page_docs = loader.load()
360
- for d in page_docs:
361
- d.page_content = fix_pdf_text(d.page_content)
362
-
363
- splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=220)
364
- chunk_docs = splitter.split_documents(page_docs)
365
-
366
-
367
- def normalize_lines(text: str):
368
- return [line.strip() for line in fix_pdf_text(text).splitlines() if line.strip()]
369
-
370
-
371
- def is_metadata_line(line: str) -> bool:
372
- line = normalize_loose(line)
373
- patterns = [
374
- r'^name$',r'^measure name', r'^description$', r'^definition$',
375
- r'^business meaning$', r'^category$', r'^owner$', r'^source$', r'^dashboard$', r'^glossary$',
376
- ]
377
- return any(re.search(p, line) for p in patterns)
378
-
379
-
380
- def looks_like_formula_start(line: str) -> bool:
381
- line = fix_pdf_text(line)
382
- low = line.lower().strip()
383
- formula_starts = [
384
- 'calculate(', 'sum(', 'count(', 'distinctcount(', 'divide(', 'if(', 'filter(',
385
- 'removefilters(', 'all(', 'average(', 'var ', 'return', 'switch(', 'countrows(',
386
- 'summarize(', 'lookupvalue(', 'selectedvalue(',
387
- ]
388
- if any(fs in low for fs in formula_starts):
389
- return True
390
- if '[' in line and ']' in line:
391
- return True
392
- if '=' in line:
393
- return True
394
- return False
395
-
396
-
397
- def extract_named_field(lines, labels):
398
- wanted = [normalize_loose(x) for x in labels]
399
- for i, line in enumerate(lines):
400
- if normalize_loose(line) in wanted and i + 1 < len(lines):
401
- return fix_pdf_text(lines[i + 1].strip())
402
- return ''
403
 
 
 
404
 
405
- def extract_label_block(lines, labels):
406
- wanted = [normalize_loose(x) for x in labels]
407
- start_idx = None
408
- for i, line in enumerate(lines):
409
- if normalize_loose(line) in wanted:
410
- start_idx = i + 1
411
- break
412
- if start_idx is None:
413
- return ''
414
- collected = []
415
- for j in range(start_idx, len(lines)):
416
- current = fix_pdf_text(lines[j].strip())
417
- if is_metadata_line(current) and normalize_loose(current) not in wanted:
418
- break
419
- collected.append(current)
420
- return ' '.join(collected).strip()
421
-
422
-
423
- def extract_formula(lines):
424
- formula_lines = []
425
- in_formula = False
426
- paren_balance = 0
427
- for i, line in enumerate(lines):
428
- line = fix_pdf_text(line.strip())
429
- if not in_formula and looks_like_formula_start(line):
430
- in_formula = True
431
- formula_lines.append(line)
432
- paren_balance += line.count('(') - line.count(')')
433
- continue
434
- if in_formula:
435
- if is_metadata_line(line) and paren_balance <= 0:
436
- break
437
- formula_lines.append(line)
438
- paren_balance += line.count('(') - line.count(')')
439
- if paren_balance <= 0:
440
- next_line = fix_pdf_text(lines[i + 1].strip()) if i + 1 < len(lines) else ''
441
- if next_line and is_metadata_line(next_line):
442
- break
443
- return '\n'.join(formula_lines).strip()
444
-
445
-
446
- def remove_formula_lines(lines, formula_text):
447
- if not formula_text:
448
- return lines
449
- formula_lines = {fix_pdf_text(x.strip()) for x in formula_text.splitlines() if x.strip()}
450
- return [x for x in lines if fix_pdf_text(x.strip()) not in formula_lines]
451
-
452
-
453
- def build_business_meaning(audience, kpi_name, measure_name):
454
- base_name = fix_pdf_text(measure_name or kpi_name or 'This KPI')
455
- if audience == 'Leadership':
456
- return f"{base_name} helps leadership monitor performance and coverage trends for decision-making."
457
- if audience == 'Analytics User':
458
- return f"{base_name} is used in reporting and should be interpreted with source logic, filters, and exclusions."
459
- return f"{base_name} helps business users understand what is being tracked and why it matters."
460
-
461
-
462
- def parse_doc_entry(doc, audience, match_info=None, forced_kpi_name=None, excel_mapping=None, query_text=None):
463
- context = fix_pdf_text(doc.page_content)
464
- lines = normalize_lines(context)
465
- formula = extract_formula(lines)
466
- non_formula_lines = remove_formula_lines(lines, formula)
467
-
468
- kpi_name = extract_named_field(non_formula_lines, ['Name'])
469
- measure_name = extract_named_field(non_formula_lines, ['Measure name in the PBI', 'Measure Name'])
470
- if forced_kpi_name and (not kpi_name or normalize_loose(kpi_name) == 'not found'):
471
- kpi_name = forced_kpi_name
472
-
473
- definition = extract_label_block(non_formula_lines, ['Description', 'Definition'])
474
- if not definition:
475
- heur = []
476
- for line in non_formula_lines:
477
- low = line.lower()
478
- if any(x in low for x in ['number of', 'count of', 'unique', '%', 'percent', 'rate of', 'ratio of', 'calculated as']):
479
- heur.append(fix_pdf_text(line))
480
- definition = ' '.join(heur[:3]).strip() or 'Definition not found clearly in the source extract.'
481
- if not formula:
482
- formula = 'Formula not found in source extract.'
483
-
484
- excel_info = lookup_kpi_excel_info(kpi_name, measure_name, excel_mapping or {}, query_text=query_text)
485
- report_sources = excel_info.get('report_sources', []) if excel_info else []
486
- extra_excel_info = excel_info.get('extra_info', {}) if excel_info else {}
487
- matched_rows = excel_info.get('row_ids', []) if excel_info else []
488
-
489
- notes = []
490
- if kpi_name:
491
- notes.append(f"**KPI Name:** {fix_pdf_text(kpi_name)}")
492
- if measure_name:
493
- notes.append(f"**Power BI Measure:** {fix_pdf_text(measure_name)}")
494
- if report_sources:
495
- notes.append(f"**Report / Offering Presence (Yes columns):** {', '.join(report_sources)}")
496
- if matched_rows:
497
- notes.append(f"**Matched Excel Row Count:** {len(matched_rows)}")
498
- if extra_excel_info.get('Placement in Offering'):
499
- notes.append(f"**Placement in Offering:** {extra_excel_info['Placement in Offering']}")
500
- if extra_excel_info.get('Calculated at:'):
501
- notes.append(f"**Calculated at:** {extra_excel_info['Calculated at:']}")
502
- if extra_excel_info.get('Domain'):
503
- notes.append(f"**Domain:** {extra_excel_info['Domain']}")
504
- if extra_excel_info.get('Interaction'):
505
- notes.append(f"**Interaction:** {extra_excel_info['Interaction']}")
506
- if extra_excel_info.get('Channels'):
507
- notes.append(f"**Channels:** {extra_excel_info['Channels']}")
508
- if doc.metadata.get('page') is not None:
509
- notes.append(f"**Page:** {doc.metadata['page'] + 1}")
510
- if match_info:
511
- notes.append(f"**Primary Search Match:** {match_info}")
512
-
513
- return {
514
- 'doc': doc,
515
- 'page': doc.metadata.get('page'),
516
- 'context': context,
517
- 'kpi_name': fix_pdf_text(kpi_name) or 'Not found',
518
- 'measure_name': fix_pdf_text(measure_name) or 'Not found',
519
- 'definition': fix_pdf_text(definition),
520
- 'business': build_business_meaning(audience, kpi_name, measure_name),
521
- 'formula': fix_pdf_text(formula),
522
- 'notes': '\n\n'.join(notes) if notes else 'No additional notes found.',
523
- 'report_sources': report_sources,
524
- 'excel_info': extra_excel_info,
525
- }
526
-
527
-
528
- PARSED_CHUNKS = [parse_doc_entry(doc, 'Business User') for doc in chunk_docs]
529
-
530
-
531
- def entry_key(entry):
532
- return (
533
- normalize_exact(entry['kpi_name']),
534
- normalize_exact(entry['measure_name']),
535
- normalize_exact(entry['context'][:300]),
536
- )
537
-
538
-
539
- def build_indices(entries):
540
- kpi_exact_index, measure_exact_index, kpi_loose_index, measure_loose_index = {}, {}, {}, {}
541
- seen = set()
542
- for entry in entries:
543
- key = entry_key(entry)
544
- if key in seen:
545
- continue
546
- seen.add(key)
547
- nk_exact = normalize_exact(entry['kpi_name'])
548
- nm_exact = normalize_exact(entry['measure_name'])
549
- nk_loose = normalize_loose(entry['kpi_name'])
550
- nm_loose = normalize_loose(entry['measure_name'])
551
- if nk_exact and nk_exact != 'not found':
552
- kpi_exact_index.setdefault(nk_exact, []).append(entry)
553
- if nm_exact and nm_exact != 'not found':
554
- measure_exact_index.setdefault(nm_exact, []).append(entry)
555
- if nk_loose and nk_loose != 'not found':
556
- kpi_loose_index.setdefault(nk_loose, []).append(entry)
557
- if nm_loose and nm_loose != 'not found':
558
- measure_loose_index.setdefault(nm_loose, []).append(entry)
559
- return kpi_exact_index, measure_exact_index, kpi_loose_index, measure_loose_index
560
-
561
-
562
- EXACT_KPI_INDEX, EXACT_MEASURE_INDEX, LOOSE_KPI_INDEX, LOOSE_MEASURE_INDEX = build_indices(PARSED_CHUNKS)
563
- ALL_LOOSE_KPI_NAMES = sorted(LOOSE_KPI_INDEX.keys())
564
- ALL_LOOSE_MEASURE_NAMES = sorted(LOOSE_MEASURE_INDEX.keys())
565
-
566
-
567
- def token_overlap_score(query_text: str, candidate_text: str):
568
- q_tokens = significant_tokens(query_text)
569
- c_tokens = significant_tokens(candidate_text)
570
- if not q_tokens or not c_tokens:
571
- return 0.0, 0, 0
572
- q_set, c_set = set(q_tokens), set(c_tokens)
573
- overlap = q_set & c_set
574
- coverage = len(overlap) / max(len(q_set), 1)
575
- return coverage, len(overlap), len(c_set)
576
-
577
-
578
- def find_best_exact_like_name(query_text: str):
579
- q_exact = normalize_exact(query_text)
580
- q_loose = normalize_loose(query_text)
581
- if not q_loose:
582
- return None, None
583
- if q_exact in EXACT_KPI_INDEX:
584
- return 'kpi_exact', q_exact
585
- if q_exact in EXACT_MEASURE_INDEX:
586
- return 'measure_exact', q_exact
587
- if q_loose in LOOSE_KPI_INDEX:
588
- return 'kpi_loose', q_loose
589
- if q_loose in LOOSE_MEASURE_INDEX:
590
- return 'measure_loose', q_loose
591
-
592
- best, best_score = None, -1.0
593
- for name in ALL_LOOSE_KPI_NAMES:
594
- coverage, overlap_count, candidate_size = token_overlap_score(q_loose, name)
595
- if coverage == 1.0 and overlap_count >= 2:
596
- score = overlap_count * 10 - max(candidate_size - overlap_count, 0)
597
- if score > best_score:
598
- best_score, best = score, ('kpi_loose', name)
599
- for name in ALL_LOOSE_MEASURE_NAMES:
600
- coverage, overlap_count, candidate_size = token_overlap_score(q_loose, name)
601
- if coverage == 1.0 and overlap_count >= 2:
602
- score = overlap_count * 10 - max(candidate_size - overlap_count, 0)
603
- if score > best_score:
604
- best_score, best = score, ('measure_loose', name)
605
- return best if best else (None, None)
606
-
607
-
608
- def doc_contains_exact_text(doc, search_text: str) -> bool:
609
- return normalize_loose(search_text) in normalize_loose(doc.page_content)
610
-
611
 
612
  # =========================================================
613
- # 4) SEARCH
614
  # =========================================================
615
- def choose_primary_entry(query: str, audience: str, excel_mapping=None):
616
- cleaned_query = clean_user_query(query)
617
- if not cleaned_query:
618
- return None, None
619
- resolved_query, _, canonical_term = resolve_alias(query)
620
- effective_query = canonical_term if canonical_term else resolved_query
621
- match_type, canonical_name = find_best_exact_like_name(effective_query)
622
-
623
- if match_type == 'kpi_exact':
624
- chosen = EXACT_KPI_INDEX[canonical_name][0]
625
- return parse_doc_entry(chosen['doc'], audience, match_info='Exact KPI name match', excel_mapping=excel_mapping, query_text=effective_query), 100.0
626
- if match_type == 'measure_exact':
627
- chosen = EXACT_MEASURE_INDEX[canonical_name][0]
628
- return parse_doc_entry(chosen['doc'], audience, match_info='Exact PBI measure match', excel_mapping=excel_mapping, query_text=effective_query), 95.0
629
- if match_type == 'kpi_loose':
630
- chosen = LOOSE_KPI_INDEX[canonical_name][0]
631
- return parse_doc_entry(chosen['doc'], audience, match_info='Normalized KPI name match', excel_mapping=excel_mapping, query_text=effective_query), 90.0
632
- if match_type == 'measure_loose':
633
- chosen = LOOSE_MEASURE_INDEX[canonical_name][0]
634
- return parse_doc_entry(chosen['doc'], audience, match_info='Normalized PBI measure match', excel_mapping=excel_mapping, query_text=effective_query), 88.0
635
-
636
- raw_chunk_hits = [doc for doc in chunk_docs if doc_contains_exact_text(doc, effective_query)]
637
- if raw_chunk_hits:
638
- chosen_doc = raw_chunk_hits[0]
639
- return parse_doc_entry(chosen_doc, audience, match_info='Exact raw text found in PDF chunk', forced_kpi_name=effective_query, excel_mapping=excel_mapping, query_text=effective_query), 75.0
640
-
641
- raw_page_hits = [doc for doc in page_docs if doc_contains_exact_text(doc, effective_query)]
642
- if raw_page_hits:
643
- chosen_doc = raw_page_hits[0]
644
- return parse_doc_entry(chosen_doc, audience, match_info='Exact raw text found in PDF page', forced_kpi_name=effective_query, excel_mapping=excel_mapping, query_text=effective_query), 70.0
645
- return None, None
646
 
 
647
 
648
- def find_second_same_occurrence(primary_entry, audience: str, excel_mapping=None):
649
- target_name_loose = normalize_loose(primary_entry['kpi_name'])
650
- if not target_name_loose or target_name_loose == 'not found':
651
- return None
652
- primary_context = normalize_exact(primary_entry['context'][:400])
653
 
654
- if target_name_loose in LOOSE_KPI_INDEX:
655
- candidates = [e for e in LOOSE_KPI_INDEX[target_name_loose] if normalize_exact(e['context'][:400]) != primary_context]
656
- if candidates:
657
- candidates.sort(key=lambda e: (e['page'] if e['page'] is not None else 99999))
658
- return parse_doc_entry(candidates[0]['doc'], audience, excel_mapping=excel_mapping, query_text=primary_entry['kpi_name'])
659
-
660
- for doc in chunk_docs:
661
- if target_name_loose in normalize_loose(doc.page_content) and normalize_exact(doc.page_content[:400]) != primary_context:
662
- return parse_doc_entry(doc, audience, forced_kpi_name=primary_entry['kpi_name'], excel_mapping=excel_mapping, query_text=primary_entry['kpi_name'])
663
- for doc in page_docs:
664
- if target_name_loose in normalize_loose(doc.page_content) and normalize_exact(doc.page_content[:400]) != primary_context:
665
- return parse_doc_entry(doc, audience, forced_kpi_name=primary_entry['kpi_name'], excel_mapping=excel_mapping, query_text=primary_entry['kpi_name'])
666
- return None
667
 
 
 
 
668
 
669
  # =========================================================
670
- # 5) UI HELPERS
671
  # =========================================================
672
- def compare_same(value1, value2, formula=False):
673
- return clean_formula_text(value1) == clean_formula_text(value2) if formula else normalize_loose(value1) == normalize_loose(value2)
674
-
675
-
676
- def render_badges(sources):
677
- if not sources:
678
- return "<span class='pill neutral'>Not mapped</span>"
679
- colors = ['info', 'success', 'warning', 'neutral']
680
- pills = []
681
- for i, src in enumerate(sources):
682
- color = colors[i % len(colors)]
683
- pills.append(f"<span class='pill {color}'>{html_escape(src)}</span>")
684
- return ' '.join(pills)
685
-
686
-
687
- def field_diff_html(left_text, right_text, formula=False):
688
- left_text = fix_pdf_text(left_text or '')
689
- right_text = fix_pdf_text(right_text or '')
690
- if compare_same(left_text, right_text, formula=formula):
691
- return "<div class='diff-box same'>No difference. Both occurrences match for this field.</div>"
692
- left_lines = [ln for ln in left_text.splitlines() if ln.strip()] or ['Not found']
693
- right_lines = [ln for ln in right_text.splitlines() if ln.strip()] or ['Not found']
694
- removed = [x for x in left_lines if x not in right_lines]
695
- added = [x for x in right_lines if x not in left_lines]
696
- removed_html = ''.join(f"<li>{html_escape(line)}</li>" for line in removed[:12]) or '<li>No unique lines found.</li>'
697
- added_html = ''.join(f"<li>{html_escape(line)}</li>" for line in added[:12]) or '<li>No unique lines found.</li>'
698
- return f"""
699
- <div class='diff-box different'>
700
- <div class='diff-title'>What differs</div>
701
- <div class='diff-grid'>
702
- <div class='diff-col'><div class='diff-col-title'>Only in Occurrence 1</div><ul>{removed_html}</ul></div>
703
- <div class='diff-col'><div class='diff-col-title'>Only in Occurrence 2</div><ul>{added_html}</ul></div>
704
- </div>
705
- </div>
706
- """
707
 
 
 
708
 
709
- def build_summary_cards(entry1, entry2=None, retrieval_score=None):
710
- def badge(text, kind='default'):
711
- return f"<span class='pill {kind}'>{html_escape(text)}</span>"
712
-
713
- page1 = f"Page {entry1['page'] + 1}" if entry1 and entry1['page'] is not None else 'Page not found'
714
- report_badges = render_badges(entry1.get('report_sources', []))
715
-
716
- cards = [
717
- f"<div class='summary-card'><div class='summary-label'>KPI Name</div><div class='summary-value'>{html_escape(entry1['kpi_name'])}</div><div class='summary-sub'>{badge(page1, 'info')}</div></div>",
718
- f"<div class='summary-card'><div class='summary-label'>PBI Measure</div><div class='summary-value'>{html_escape(entry1['measure_name'])}</div><div class='summary-sub'>{badge('Primary result', 'success')}</div></div>",
719
- f"<div class='summary-card'><div class='summary-label'>Report / Offering</div><div class='summary-value badge-wrap'>{report_badges}</div><div class='summary-sub'>{badge('Yes columns from Excel', 'neutral')}</div></div>",
720
- ]
721
-
722
- compare_hint = 'One occurrence found'
723
- compare_kind = 'neutral'
724
- if entry2:
725
- same_all = (
726
- compare_same(entry1['kpi_name'], entry2['kpi_name']) and
727
- compare_same(entry1['measure_name'], entry2['measure_name']) and
728
- compare_same(entry1['definition'], entry2['definition']) and
729
- compare_same(entry1['formula'], entry2['formula'], formula=True)
730
- )
731
- compare_hint = 'Exact name match found' if same_all else 'Exact name match found (differences detected)'
732
- compare_kind = 'success' if same_all else 'warning'
733
-
734
- checked_text = '2 exact-name matches checked' if entry2 else 'No second exact-name match'
735
- if retrieval_score is not None:
736
- checked_text = f"search score {retrieval_score:.1f}"
737
 
738
- cards.append(
739
- f"<div class='summary-card'><div class='summary-label'>Comparison Status</div><div class='summary-value'>{html_escape(compare_hint)}</div><div class='summary-sub'>{badge(checked_text, compare_kind)}</div></div>"
740
- )
741
- return "<div class='summary-grid'>" + ''.join(cards) + "</div>"
742
-
743
-
744
def build_side_by_side_comparison(entry1, entry2):
    """Render an HTML side-by-side comparison of two KPI occurrences.

    Args:
        entry1: Primary KPI entry; a mapping read for the keys ``page``,
            ``kpi_name``, ``measure_name``, ``definition`` and ``formula``.
        entry2: Second occurrence with the same keys, or a falsy value when
            no second occurrence exists.

    Returns:
        An HTML string: an empty-state box when both entries are missing, a
        single-result banner when only one entry is available, otherwise a
        banner, two header cards and one comparison row (plus diff panel)
        per field.
    """
    if not entry1 and not entry2:
        return "<div class='empty-state'>No relevant KPI entry found.</div>"
    if entry2 and not entry1:
        # Only the second entry exists: treat it as the primary result
        # instead of subscripting a missing entry1 further down.
        entry1, entry2 = entry2, None
    if entry1 and not entry2:
        page_text = f"Page {entry1['page'] + 1}" if entry1['page'] is not None else 'Unknown page'
        kpi_text = html_escape(entry1['kpi_name'])
        return f"<div class='compare-wrap single'><div class='compare-banner neutral'>Primary result shown for <b>{kpi_text}</b> ({html_escape(page_text)}). No second occurrence with the <b>exact same KPI name</b> was found.</div></div>"

    same_all = (
        compare_same(entry1['kpi_name'], entry2['kpi_name']) and
        compare_same(entry1['measure_name'], entry2['measure_name']) and
        compare_same(entry1['definition'], entry2['definition']) and
        compare_same(entry1['formula'], entry2['formula'], formula=True)
    )
    overall_class = 'success' if same_all else 'warning'
    overall_text = 'Exact same KPI name found in two places' if same_all else 'Exact same KPI name found in two places, but details differ'
    # Stored page numbers are shown 1-based.
    page1 = f"Page {entry1['page'] + 1}" if entry1['page'] is not None else 'Unknown'
    page2 = f"Page {entry2['page'] + 1}" if entry2['page'] is not None else 'Unknown'

    rows = []
    # (label, left value, right value, is_formula)
    fields = [
        ('KPI Name', entry1['kpi_name'], entry2['kpi_name'], False),
        ('Power BI Measure', entry1['measure_name'], entry2['measure_name'], False),
        ('Definition', entry1['definition'], entry2['definition'], False),
        ('Formula', entry1['formula'], entry2['formula'], True),
    ]
    for label, left_val, right_val, is_formula in fields:
        left_val, right_val = fix_pdf_text(left_val or 'Not found'), fix_pdf_text(right_val or 'Not found')
        status = 'same' if compare_same(left_val, right_val, formula=is_formula) else 'different'
        diff_panel = field_diff_html(left_val, right_val, formula=is_formula)
        code_class = 'code-block' if is_formula else ''
        rows.append(f"""
        <div class='compare-row {status}'>
        <div class='compare-field'><div class='field-name'>{html_escape(label)}</div><div class='field-status {status}'>{'SAME' if status == 'same' else 'DIFFERENT'}</div></div>
        <div class='compare-cell'><div class='cell-title'>Occurrence 1</div><div class='cell-content {code_class}'>{nl2br(left_val)}</div></div>
        <div class='compare-cell'><div class='cell-title'>Occurrence 2</div><div class='cell-content {code_class}'>{nl2br(right_val)}</div></div>
        </div>
        <div class='diff-row'>{diff_panel}</div>
        """)
    return f"""
    <div class='compare-wrap'>
    <div class='compare-banner {overall_class}'>{html_escape(overall_text)}</div>
    <div class='compare-head'>
    <div class='head-card'><div class='head-label'>Occurrence 1</div><div class='head-page'>{html_escape(page1)}</div><div class='head-name'>{html_escape(entry1['kpi_name'])}</div></div>
    <div class='head-card'><div class='head-label'>Occurrence 2</div><div class='head-page'>{html_escape(page2)}</div><div class='head-name'>{html_escape(entry2['kpi_name'])}</div></div>
    </div>
    <div class='compare-table'>{''.join(rows)}</div>
    </div>
    """
793
 
 
794
 
795
  # =========================================================
796
- # 6) FEEDBACK FLOW
797
  # =========================================================
798
def run_search_and_prepare_feedback(question, audience, excel_mapping):
    """Run a KPI search and reset the feedback widgets for the new result.

    Args:
        question: Text typed by the user.
        audience: Selected audience label, forwarded to ``get_answer``.
        excel_mapping: Excel mapping state, forwarded to ``get_answer``.

    Returns:
        The ``get_answer`` tuple followed by the KPI name extracted from the
        notes output and eleven ``gr.update`` values: the feedback panel and
        satisfaction radio are shown, everything else is hidden or cleared.
    """
    results = get_answer(question, audience, excel_mapping=excel_mapping)
    current_kpi_name = ''
    if isinstance(results, tuple) and len(results) >= 5:
        # Index 4 of the answer tuple is the notes text.
        current_kpi_name = extract_kpi_name_from_notes(results[4] or '')
    return results + (
        current_kpi_name,
        gr.update(visible=True), gr.update(value=None, visible=True),
        gr.update(visible=False), gr.update(value=None), gr.update(value='', visible=False),
        gr.update(visible=False), gr.update(value=''), gr.update(visible=False), gr.update(value=None),
        gr.update(value='', visible=False), gr.update(value='', visible=False),
    )
810
-
811
-
812
def clear_feedback_only():
    """Return eleven ``gr.update`` values that hide and clear the feedback widgets."""
    return (
        gr.update(visible=False), gr.update(value=None, visible=False),
        gr.update(visible=False), gr.update(value=None), gr.update(value='', visible=False),
        gr.update(visible=False), gr.update(value=''), gr.update(visible=False), gr.update(value=None),
        gr.update(value='', visible=False), gr.update(value='', visible=False),
    )
819
-
820
-
821
def on_satisfaction_change(choice):
    """Return five ``gr.update`` values for the satisfaction answer.

    'Yes' makes the first update visible and shows a rating prompt; 'No'
    makes the second update visible and shows an "ask more" prompt; any
    other value hides everything.

    Args:
        choice: Value of the satisfaction radio ('Yes', 'No' or anything else).
    """
    if choice == 'Yes':
        return (
            gr.update(visible=True), gr.update(visible=False), gr.update(visible=False),
            gr.update(value='', visible=False), gr.update(value='Please rate the definition from 1 to 5.', visible=True),
        )
    if choice == 'No':
        return (
            gr.update(visible=False), gr.update(visible=True), gr.update(visible=False),
            gr.update(value='', visible=False), gr.update(value='Please ask more so the app can try again.', visible=True),
        )
    return (
        gr.update(visible=False), gr.update(visible=False), gr.update(visible=False),
        gr.update(value='', visible=False), gr.update(value='', visible=False),
    )
836
-
837
-
838
def submit_rating(rating):
    """Return a visible status update acknowledging the rating.

    Args:
        rating: Selected rating value, or ``None`` when nothing is selected.

    Returns:
        A ``gr.update`` with a thank-you message, or a prompt to pick a
        rating when ``rating`` is ``None``.
    """
    if rating is None:
        return gr.update(value='Please select a rating from 1 to 5.', visible=True)
    return gr.update(value=f"Thanks for the feedback. You rated the definition **{rating}/5**.", visible=True)
842
-
843
-
844
def run_followup_search(followup_question, audience, current_kpi_name, excel_mapping):
    """Re-run the search for a follow-up question and refresh the feedback area.

    Args:
        followup_question: Follow-up text typed by the user.
        audience: Selected audience label, forwarded to ``get_answer``.
        current_kpi_name: KPI name remembered from the previous result.
        excel_mapping: Excel mapping state, forwarded to ``get_answer``.

    Returns:
        Eighteen values: six answer outputs, the remembered KPI name and
        eleven ``gr.update`` values for the feedback widgets. For a blank
        follow-up the six answer outputs are no-op updates and a prompt
        message is shown instead.
    """
    if not followup_question or not followup_question.strip():
        return (
            gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), gr.update(),
            gr.update(value=current_kpi_name), gr.update(visible=True), gr.update(value='No', visible=True),
            gr.update(visible=False), gr.update(value=None), gr.update(value='', visible=False),
            gr.update(visible=True), gr.update(value=''), gr.update(visible=True), gr.update(value=None),
            gr.update(value='Please type a follow-up question before submitting.', visible=True), gr.update(value='', visible=False),
        )

    # A generic follow-up is replaced by the remembered KPI name so the
    # search stays on the same KPI.
    effective_followup = current_kpi_name if current_kpi_name and is_generic_followup_question(followup_question) else followup_question
    used_context = effective_followup != followup_question
    results = get_answer(effective_followup, audience, excel_mapping=excel_mapping)
    new_current_kpi = current_kpi_name or ''
    if isinstance(results, tuple) and len(results) >= 5:
        extracted = extract_kpi_name_from_notes(results[4] or '')
        if extracted:
            new_current_kpi = extracted
    helper_message = 'If you are still not satisfied, choose below to raise an incident.'
    if used_context and current_kpi_name:
        helper_message = f"Used KPI context from the previous result: **{current_kpi_name}**. If you are still not satisfied, choose below to raise an incident."
    return results + (
        new_current_kpi, gr.update(visible=True), gr.update(value='No', visible=True),
        gr.update(visible=False), gr.update(value=None), gr.update(value='', visible=False),
        gr.update(visible=True), gr.update(value=followup_question), gr.update(visible=True), gr.update(value=None),
        gr.update(value=helper_message, visible=True), gr.update(value='', visible=False),
    )
871
-
872
-
873
def on_still_not_satisfied_change(choice):
    """Return the incident box and status updates for the final feedback answer.

    Args:
        choice: 'Yes' shows a ServiceNow incident link, 'No' shows a closing
            message, anything else hides both outputs.

    Returns:
        A pair of ``gr.update`` values: (incident HTML, status message).
    """
    if choice == 'Yes':
        html = f"<div class='incident-box'><div class='incident-title'>Still not satisfied?</div><div class='incident-text'>You can raise an incident in ServiceNow for further help.</div><a class='incident-link' href='{html_escape(SERVICENOW_INCIDENT_URL)}' target='_blank' rel='noopener noreferrer'>Raise Incident in ServiceNow</a></div>"
        return gr.update(value=html, visible=True), gr.update(value='You selected to raise an incident for further support.', visible=True)
    if choice == 'No':
        return gr.update(value='', visible=False), gr.update(value='Glad the follow-up helped.', visible=True)
    return gr.update(value='', visible=False), gr.update(value='', visible=False)
880
 
881
-
882
- # =========================================================
883
- # 7) MAIN ANSWER
884
- # =========================================================
885
def get_answer(question, audience, excel_mapping=None):
    """Look up a KPI and build the six outputs shown in the UI.

    Args:
        question: Text typed by the user.
        audience: Selected audience label, forwarded to the lookup helpers.
        excel_mapping: Optional Excel mapping, forwarded to the lookup helpers.

    Returns:
        A 6-tuple ``(summary_html, definition, business, formula, notes,
        comparison_html)``. Blank questions and unmatched KPIs return
        placeholder text in the same six positions.
    """
    if not question or not question.strip():
        return ('<div class="empty-state">Ask a KPI question to see the summary cards.</div>', 'Please enter a KPI question.', '', '', '', '<div class="empty-state">No comparison available.</div>')

    primary_entry, best_score = choose_primary_entry(question, audience, excel_mapping=excel_mapping)
    if primary_entry is None:
        workbook_note = DEFAULT_KPI_EXCEL if Path(DEFAULT_KPI_EXCEL).exists() else f"{DEFAULT_KPI_EXCEL} not found next to the app file"
        return (
            '<div class="empty-state">No KPI found. The app auto-loads the KPI Glossary Excel and should print the Yes columns for the matching KPI row, but this KPI could not be matched safely.</div>',
            'No KPI found for the searched text.', '', '',
            f"**Search Tried:** `{fix_pdf_text(clean_user_query(question))}`\n\n**Excel Auto-load:** {workbook_note}\n\nIf the KPI text is present visually in the PDF but still not found, the PDF extraction may be breaking the text across lines/chunks.",
            '<div class="empty-state">No comparison available because the primary KPI was not found.</div>',
        )

    second_entry = find_second_same_occurrence(primary_entry, audience, excel_mapping=excel_mapping)
    summary_html = build_summary_cards(primary_entry, second_entry, retrieval_score=best_score)
    comparison_html = build_side_by_side_comparison(primary_entry, second_entry)
    return summary_html, primary_entry['definition'], primary_entry['business'], primary_entry['formula'], primary_entry['notes'], comparison_html
903
-
904
-
905
def clear_all(default_mapping):
    """Return reset values for every input, output and feedback widget.

    Args:
        default_mapping: Excel mapping state; returned unchanged.

    Returns:
        A tuple of ten reset values followed by the eleven updates from
        ``clear_feedback_only()``.
    """
    return (
        '', 'Business User', '<div class="empty-state">Ask a KPI question to see the summary cards.</div>',
        '', '', '', '', '<div class="empty-state">Comparison results will appear here.</div>',
        default_mapping, '', *clear_feedback_only(),
    )
911
-
912
-
913
- # =========================================================
914
- # 8) UI
915
- # =========================================================
916
# Stylesheet injected into the page through gr.HTML(CUSTOM_CSS).
# NOTE(review): the original whitespace inside this literal could not be
# recovered; the declarations themselves are unchanged.
CUSTOM_CSS = """
<style>
:root {
  --bg1: #f6f8ff; --bg2: #fafdff; --bg3: #eef4ff; --card: rgba(255,255,255,0.82);
  --card-strong: rgba(255,255,255,0.94); --stroke: rgba(99, 102, 241, 0.14); --text: #14213d;
  --muted: #667085; --primary: #5b5bd6; --primary-2: #7c4dff; --success-bg: #ecfdf3;
  --success-text: #067647; --warning-bg: #fff7ed; --warning-text: #c2410c; --neutral-bg: #f8fafc;
  --neutral-text: #475467; --shadow: 0 18px 40px rgba(34, 55, 110, 0.10);
}
body, .gradio-container { background: linear-gradient(135deg, var(--bg1) 0%, var(--bg2) 45%, var(--bg3) 100%) !important; }
.gradio-container { max-width: 1500px !important; padding-top: 18px !important; }
.hero { background: linear-gradient(135deg, rgba(91,91,214,0.14), rgba(124,77,255,0.08), rgba(59,130,246,0.06)); border: 1px solid rgba(124,77,255,0.14); box-shadow: var(--shadow); border-radius: 26px; padding: 26px 30px; margin-bottom: 18px; backdrop-filter: blur(10px); }
.hero-title { font-size: 34px; font-weight: 800; color: var(--text); margin: 0 0 8px 0; }
.hero-subtitle { font-size: 15px; color: var(--muted); margin: 0; line-height: 1.65; }
.panel { background: var(--card) !important; border: 1px solid var(--stroke) !important; border-radius: 22px !important; box-shadow: var(--shadow) !important; padding: 16px !important; backdrop-filter: blur(12px); }
textarea, input, .gr-textbox, .gr-dropdown, .gr-radio { border-radius: 16px !important; }
button.primary, button[class*='primary'] { background: linear-gradient(135deg, var(--primary), var(--primary-2)) !important; border: none !important; color: white !important; border-radius: 16px !important; box-shadow: 0 10px 22px rgba(91,91,214,0.22) !important; }
button.secondary { border-radius: 16px !important; }
button[role='tab'][aria-selected='true'] { color: var(--primary) !important; border-bottom: 3px solid var(--primary) !important; }
.kpi-note { background: rgba(255,255,255,0.68); border: 1px dashed rgba(91,91,214,0.18); border-radius: 16px; padding: 12px 14px; color: var(--muted); font-size: 13px; margin-top: 8px; }
.summary-grid { display: grid; grid-template-columns: repeat(5, minmax(0, 1fr)); gap: 14px; margin-bottom: 16px; }
.summary-card { background: linear-gradient(180deg, var(--card-strong), rgba(255,255,255,0.72)); border: 1px solid rgba(91,91,214,0.12); border-radius: 20px; padding: 16px; box-shadow: 0 12px 28px rgba(56,72,122,0.08); min-height: 122px; }
.summary-label { color: var(--muted); font-size: 12px; font-weight: 700; letter-spacing: .04em; text-transform: uppercase; margin-bottom: 10px; }
.summary-value { color: var(--text); font-size: 20px; font-weight: 800; line-height: 1.25; word-break: break-word; }
.summary-sub { margin-top: 14px; }
.badge-wrap { display:flex; flex-wrap:wrap; gap:8px; align-items:flex-start; }
.pill { display:inline-flex; align-items:center; gap:6px; padding:7px 11px; border-radius:999px; font-size:12px; font-weight:700; }
.pill.info { background: rgba(59,130,246,0.12); color:#1d4ed8; }
.pill.success { background: rgba(16,185,129,0.14); color:#047857; }
.pill.warning { background: rgba(245,158,11,0.16); color:#b45309; }
.pill.neutral { background: rgba(100,116,139,0.12); color:#475467; }
.compare-wrap { display:flex; flex-direction:column; gap:14px; }
.compare-banner { padding:14px 16px; border-radius:16px; font-weight:800; font-size:14px; border:1px solid transparent; }
.compare-banner.success { background: var(--success-bg); color: var(--success-text); }
.compare-banner.warning { background: var(--warning-bg); color: var(--warning-text); }
.compare-banner.neutral { background: var(--neutral-bg); color: var(--neutral-text); }
.compare-head { display:grid; grid-template-columns: repeat(2, minmax(0,1fr)); gap:14px; }
.head-card { background: rgba(255,255,255,0.82); border:1px solid rgba(99,102,241,0.12); border-radius:18px; padding:16px; }
.head-label { color: var(--muted); font-size:12px; font-weight:700; text-transform:uppercase; letter-spacing:.04em; }
.head-page { color: var(--primary); font-size:13px; font-weight:700; margin-top:6px; }
.head-name { color: var(--text); font-size:18px; font-weight:800; margin-top:8px; }
.compare-table { display:flex; flex-direction:column; gap:12px; }
.compare-row { display:grid; grid-template-columns:220px 1fr 1fr; gap:12px; align-items:stretch; }
.compare-field, .compare-cell { background: rgba(255,255,255,0.82); border:1px solid rgba(99,102,241,0.10); border-radius:18px; padding:14px; }
.compare-row.same .compare-field { background: linear-gradient(180deg, #f0fdf4, #ffffff); }
.compare-row.different .compare-field { background: linear-gradient(180deg, #fff7ed, #ffffff); }
.field-name { color: var(--text); font-weight:800; font-size:15px; }
.field-status { display:inline-block; margin-top:12px; padding:6px 10px; border-radius:999px; font-size:11px; font-weight:800; letter-spacing:.05em; }
.field-status.same { background: rgba(16,185,129,0.14); color:#047857; }
.field-status.different { background: rgba(245,158,11,0.16); color:#b45309; }
.cell-title { color: var(--muted); font-size:12px; font-weight:700; text-transform:uppercase; letter-spacing:.04em; margin-bottom:8px; }
.cell-content { color: var(--text); font-size:14px; line-height:1.6; white-space:normal; word-break:break-word; }
.code-block { font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, 'Liberation Mono', monospace; background:#f8fafc; border:1px solid rgba(148,163,184,0.16); border-radius:14px; padding:12px; white-space:pre-wrap; }
.diff-box { background: rgba(255,255,255,0.76); border:1px solid rgba(99,102,241,0.10); border-radius:18px; padding:14px; }
.diff-box.same { color:#047857; background: rgba(236,253,243,0.82); }
.diff-box.different { background: rgba(255,247,237,0.78); }
.diff-title { font-size:13px; font-weight:800; color: var(--text); margin-bottom:10px; }
.diff-grid { display:grid; grid-template-columns: repeat(2, minmax(0,1fr)); gap:12px; }
.diff-col { background: rgba(255,255,255,0.85); border-radius:14px; padding:12px; border:1px dashed rgba(99,102,241,0.12); }
.diff-col-title { font-size:12px; font-weight:800; color: var(--muted); margin-bottom:8px; text-transform:uppercase; }
.diff-col ul { margin:0; padding-left:18px; }
.diff-col li { margin:6px 0; color: var(--text); font-size:13px; }
.feedback-box { background: rgba(255,255,255,0.76); border:1px solid rgba(99,102,241,0.10); border-radius:18px; padding:16px; margin-top:14px; }
.feedback-title { font-size:16px; font-weight:800; color: var(--text); margin-bottom:8px; }
.incident-box { background: rgba(255,247,237,0.78); border:1px solid rgba(245,158,11,0.22); border-radius:16px; padding:14px; margin-top:10px; }
.incident-title { font-weight:800; color:#9a3412; margin-bottom:6px; }
.incident-text { color:#7c2d12; margin-bottom:10px; }
.incident-link { display:inline-block; padding:10px 14px; border-radius:12px; background:#7c3aed; color:white !important; text-decoration:none; font-weight:700; }
.empty-state { background: rgba(255,255,255,0.74); border:1px dashed rgba(91,91,214,0.20); border-radius:18px; padding:18px; color: var(--muted); }
@media (max-width:1300px){ .summary-grid{grid-template-columns:repeat(3,minmax(0,1fr));} }
@media (max-width:1100px){ .summary-grid{grid-template-columns:repeat(2,minmax(0,1fr));} .compare-row{grid-template-columns:1fr;} .compare-head{grid-template-columns:1fr;} .diff-grid{grid-template-columns:1fr;} }
@media (max-width:700px){ .summary-grid{grid-template-columns:1fr;} }
</style>
"""
990
 
991
# Mapping loaded once at import time and handed to the UI as initial state.
DEFAULT_MAPPING = load_default_excel_if_present()
# Status line shown under the inputs; depends only on whether the workbook
# file exists on disk.
DEFAULT_STATUS = (
    f"Auto-loaded Excel: {DEFAULT_KPI_EXCEL} | mapped KPI keys: {len(DEFAULT_MAPPING)}" if Path(DEFAULT_KPI_EXCEL).exists() else
    f"Auto-load Excel not found: place '{DEFAULT_KPI_EXCEL}' next to app.py"
)
996
-
997
# Gradio UI: inputs on the left, result tabs on the right, feedback flow below.
# NOTE(review): the nesting of the `with` blocks was reconstructed from how
# the callbacks show and hide components - confirm against the original layout.
with gr.Blocks() as demo:
    gr.HTML(CUSTOM_CSS)
    gr.HTML("""
    <div class='hero'>
    <div class='hero-title'>💊 Pharma KPI Copilot</div>
    <p class='hero-subtitle'>
    This version auto-loads the KPI Glossary Excel from the same folder as <b>app.py</b>. When you type a KPI name, the app searches Excel and shows the report / offering names as colored badges based on the columns where that KPI row is marked <b>Yes</b>. It also reads the PDF for definition, formula, and notes.
    </p>
    </div>
    """)

    with gr.Row():
        with gr.Column(scale=4, elem_classes=['panel']):
            question = gr.Textbox(label='Ask KPI question', placeholder='e.g. OCCP Interactions', lines=2)
            audience = gr.Dropdown(choices=['Business User', 'Analytics User', 'Leadership'], value='Business User', label='Explain for')
            excel_status = gr.Markdown(DEFAULT_STATUS)
            submit_btn = gr.Button('Submit', variant='primary')
            clear_btn = gr.Button('Clear')
            gr.HTML("<div class='kpi-note'><b>Auto-load rule:</b> keep the Excel workbook named <b>CIA Consolidated KPIs_MetricsGovernance (1).xlsx</b> in the same folder as <b>app.py</b>. The app will search the KPI in Excel and show report names where the KPI row has <b>Yes</b>.</div>")

        with gr.Column(scale=8, elem_classes=['panel']):
            summary_cards = gr.HTML('<div class="empty-state">Ask a KPI question to see the summary cards.</div>')
            with gr.Tab('Definition'):
                definition = gr.Markdown()
            with gr.Tab('Business Meaning'):
                business = gr.Markdown()
            with gr.Tab('Formula'):
                formula = gr.Textbox(label='Formula', lines=14)
            with gr.Tab('Notes'):
                notes = gr.Markdown()
            with gr.Tab('Comparison'):
                comparison = gr.HTML('<div class="empty-state">Comparison results will appear here.</div>')

            excel_mapping_state = gr.State(DEFAULT_MAPPING)
            current_kpi_state = gr.State('')

            # Feedback flow: hidden until a search has been run.
            with gr.Group(visible=False) as feedback_panel:
                gr.HTML("<div class='feedback-box'><div class='feedback-title'>Are you satisfied with the definition?</div></div>")
                satisfied_choice = gr.Radio(choices=['Yes', 'No'], label='Was the definition satisfactory?', visible=True)
                with gr.Row(visible=False) as rating_row:
                    rating_value = gr.Radio(choices=['1', '2', '3', '4', '5'], label='Rate the definition (1 to 5)')
                    rating_submit_btn = gr.Button('Submit Rating')
                rating_status = gr.Markdown(visible=False)
                with gr.Column(visible=False) as followup_row:
                    followup_question = gr.Textbox(label='Ask more', placeholder='Please ask your follow-up question here', lines=3)
                    followup_submit_btn = gr.Button('Ask More', variant='primary')
                with gr.Row(visible=False) as still_not_satisfied_row:
                    still_not_satisfied_choice = gr.Radio(choices=['Yes', 'No'], label='Still not satisfied after the follow-up?')
                feedback_status = gr.Markdown(visible=False)
                incident_html = gr.HTML(visible=False)

    # The output order below must match the tuple order returned by each callback.
    submit_btn.click(
        fn=run_search_and_prepare_feedback,
        inputs=[question, audience, excel_mapping_state],
        outputs=[
            summary_cards, definition, business, formula, notes, comparison,
            current_kpi_state,
            feedback_panel, satisfied_choice, rating_row, rating_value,
            rating_status, followup_row, followup_question,
            still_not_satisfied_row, still_not_satisfied_choice,
            feedback_status, incident_html,
        ],
    )

    satisfied_choice.change(fn=on_satisfaction_change, inputs=[satisfied_choice], outputs=[rating_row, followup_row, still_not_satisfied_row, incident_html, feedback_status])
    rating_submit_btn.click(fn=submit_rating, inputs=[rating_value], outputs=[rating_status])
    followup_submit_btn.click(
        fn=run_followup_search,
        inputs=[followup_question, audience, current_kpi_state, excel_mapping_state],
        outputs=[
            summary_cards, definition, business, formula, notes, comparison,
            current_kpi_state,
            feedback_panel, satisfied_choice, rating_row, rating_value,
            rating_status, followup_row, followup_question,
            still_not_satisfied_row, still_not_satisfied_choice,
            feedback_status, incident_html,
        ],
    )
    still_not_satisfied_choice.change(fn=on_still_not_satisfied_change, inputs=[still_not_satisfied_choice], outputs=[incident_html, feedback_status])
    clear_btn.click(
        fn=clear_all,
        inputs=[excel_mapping_state],
        outputs=[
            question, audience, summary_cards, definition, business, formula, notes, comparison,
            excel_mapping_state, current_kpi_state,
            feedback_panel, satisfied_choice, rating_row, rating_value,
            rating_status, followup_row, followup_question,
            still_not_satisfied_row, still_not_satisfied_choice,
            feedback_status, incident_html,
        ],
    )

demo.launch(share=True)
 
 
 
 
 
 
 
 
 
1
  import os
2
  import re
 
 
 
 
3
  from pathlib import Path
4
+ import unicodedata
 
 
 
 
 
 
 
 
 
 
 
5
 
6
  import gradio as gr
7
  import pandas as pd
8
  from langchain_community.document_loaders import PyPDFLoader
9
  from langchain_text_splitters import RecursiveCharacterTextSplitter
10
 
11
+ # =========================================================
12
+ # CONFIG
13
+ # =========================================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
# Input files, resolved relative to the current working directory.
PDF_FILE = "data.pdf"
DEFAULT_KPI_EXCEL = "CIA Consolidated KPIs_MetricsGovernance (1).xlsx"
17
 
18
  # =========================================================
19
+ # TEXT HELPERS
20
  # =========================================================
21
+
22
def fix_pdf_text(text: str) -> str:
    """Return *text* normalized for matching.

    Applies Unicode NFKC normalization, lowercases, strips the ends and
    collapses every whitespace run (newlines included) into one space.

    Args:
        text: Raw text; falsy values yield ``""``. Non-string values are
            converted with ``str()`` first.

    Returns:
        The normalized string.
    """
    if not text:
        return ""
    if not isinstance(text, str):
        # unicodedata.normalize() only accepts str; coerce instead of raising.
        text = str(text)
    text = unicodedata.normalize("NFKC", text)
    text = text.lower().strip()
    text = re.sub(r"\s+", " ", text)
    return text
29
 
 
30
def normalize_exact(text: str) -> str:
    """Return the exact-match key for *text*; delegates to ``fix_pdf_text``."""
    return fix_pdf_text(text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
 
33
def clean_user_query(text: str) -> str:
    """Return the user's query normalized by ``fix_pdf_text``."""
    return fix_pdf_text(text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
  # =========================================================
37
+ # SAFE EXCEL LOAD
38
  # =========================================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
def load_default_excel_if_present():
    """Load the default KPI workbook when it exists.

    Returns:
        The DataFrame read from ``DEFAULT_KPI_EXCEL`` with the openpyxl
        engine, or ``None`` when the file is missing or reading it fails
        (the error is printed, not raised).
    """
    try:
        if Path(DEFAULT_KPI_EXCEL).exists():
            df = pd.read_excel(DEFAULT_KPI_EXCEL, engine="openpyxl")
            return df
    except Exception as e:
        # Best effort: the caller receives None when the workbook is unreadable.
        print("Excel error:", e)
    return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
 
49
  # =========================================================
50
+ # LOAD PDF
51
  # =========================================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
 
53
# Load the PDF once at import time and split it into overlapping chunks.
loader = PyPDFLoader(PDF_FILE)
docs = loader.load()

splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = splitter.split_documents(docs)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
 
59
  # =========================================================
60
+ # BUILD SIMPLE INDEX (STRICT)
61
  # =========================================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
 
63
# Maps a normalized heading to the raw text of the chunk it came from.
# A later chunk with the same heading overwrites an earlier one.
EXACT_INDEX = {}

for doc in chunks:
    # Take the first line as the possible KPI name. The line has to be picked
    # from the RAW text: normalize_exact() collapses every whitespace run
    # (newlines included) into one space, so testing for "\n" after
    # normalizing never matched and every key became the first 80 characters.
    lines = [line for line in doc.page_content.splitlines() if line.strip()]
    if len(lines) > 1:
        first_line = lines[0]
    else:
        # Single-line chunk: fall back to its first 80 normalized characters.
        first_line = normalize_exact(doc.page_content)[:80]

    key = normalize_exact(first_line)
    if key:
        EXACT_INDEX[key] = doc.page_content
74
 
75
  # =========================================================
76
+ # STRICT SEARCH (FIXED ✅)
77
  # =========================================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
 
79
def get_exact_match(query):
    """Return the indexed chunk whose key equals the normalized *query*.

    Matching is strict: the normalized query must equal an ``EXACT_INDEX``
    key. The former "fallback" loop compared ``q == key`` for every key,
    which the dictionary lookup already covers, so it was removed.

    Args:
        query: Text typed by the user.

    Returns:
        The stored chunk text, or ``None`` when no key matches.
    """
    q = normalize_exact(query)
    return EXACT_INDEX.get(q)
92
 
93
  # =========================================================
94
+ # MAIN ANSWER
95
  # =========================================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
 
 
 
 
 
97
  def get_answer(question, audience, excel_mapping=None):
 
 
98
 
99
+ if not question or not question.strip():
 
 
100
  return (
101
+ "Ask a KPI question to see the summary cards.",
102
+ "Please enter a KPI question.",
103
+ "",
104
+ "",
105
+ "",
106
+ "No comparison available."
107
  )
108
 
109
+ result = get_exact_match(question)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
 
111
+ if result:
112
+ return (