sadickam committed on
Commit
3a4f3dc
·
1 Parent(s): d93fa4c

feat: enhance chunking and extraction processes with debug artifact handling and function name corrections

Browse files
.gitignore CHANGED
@@ -165,3 +165,6 @@ tests/
165
  !.pre-commit-config.yaml
166
  !.env.example
167
  !.dockerignore
 
 
 
 
165
  !.pre-commit-config.yaml
166
  !.env.example
167
  !.dockerignore
168
+
169
+ # Local commands/notes
170
+ commands.md
scripts/chunk.py CHANGED
@@ -385,6 +385,11 @@ def _get_chunker() -> Chunker:
385
  # =============================================================================
386
 
387
 
 
 
 
 
 
388
  def _compute_file_hash(path: Path) -> str:
389
  """Compute SHA-256 hash of a file's contents.
390
 
@@ -661,7 +666,11 @@ def run_chunking( # noqa: PLR0912, PLR0915
661
  # -------------------------------------------------------------------------
662
  # Find all Markdown files in input directory (non-recursive, top-level only)
663
  # -------------------------------------------------------------------------
664
- md_files = sorted([f for f in input_dir.glob("*.md") if f.is_file()])
 
 
 
 
665
  total_files = len(md_files)
666
 
667
  # -------------------------------------------------------------------------
@@ -931,7 +940,13 @@ def main(argv: list[str] | None = None) -> int: # noqa: PLR0911
931
  # -------------------------------------------------------------------------
932
  # Check for Markdown files
933
  # -------------------------------------------------------------------------
934
- md_files = list(args.input_dir.glob("*.md"))
 
 
 
 
 
 
935
  if not md_files:
936
  print(
937
  f"Error: No markdown files found in {args.input_dir}",
 
385
  # =============================================================================
386
 
387
 
388
+ def _is_debug_artifact(path: Path) -> bool:
389
+ """Return True if the path is a debug artifact that should be excluded."""
390
+ return path.name.endswith(".raw.md")
391
+
392
+
393
  def _compute_file_hash(path: Path) -> str:
394
  """Compute SHA-256 hash of a file's contents.
395
 
 
666
  # -------------------------------------------------------------------------
667
  # Find all Markdown files in input directory (non-recursive, top-level only)
668
  # -------------------------------------------------------------------------
669
+ all_md_files = sorted([f for f in input_dir.glob("*.md") if f.is_file()])
670
+ md_files = [f for f in all_md_files if not _is_debug_artifact(f)]
671
+ skipped_debug = len(all_md_files) - len(md_files)
672
+ if skipped_debug > 0 and verbose and not quiet:
673
+ print(f"Skipping {skipped_debug} debug artifact(s) (*.raw.md)")
674
  total_files = len(md_files)
675
 
676
  # -------------------------------------------------------------------------
 
940
  # -------------------------------------------------------------------------
941
  # Check for Markdown files
942
  # -------------------------------------------------------------------------
943
+ md_files = [f for f in args.input_dir.glob("*.md") if not _is_debug_artifact(f)]
944
+ if args.verbose and args.quiet:
945
+ pass
946
+ elif args.verbose:
947
+ skipped_debug = len(list(args.input_dir.glob("*.md"))) - len(md_files)
948
+ if skipped_debug > 0:
949
+ print(f"Skipping {skipped_debug} debug artifact(s) (*.raw.md)")
950
  if not md_files:
951
  print(
952
  f"Error: No markdown files found in {args.input_dir}",
scripts/extract.py CHANGED
@@ -42,6 +42,7 @@ Note:
42
  from __future__ import annotations
43
 
44
  import argparse
 
45
  import sys
46
  import time
47
  from dataclasses import dataclass
@@ -216,6 +217,7 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
216
  - force: bool - Whether to overwrite existing files
217
  - verbose: bool - Whether to show detailed output
218
  - quiet: bool - Whether to suppress output
 
219
 
220
  Raises:
221
  ------
@@ -296,6 +298,14 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
296
  default=False,
297
  help="Suppress all output except errors (still shows summary)",
298
  )
 
 
 
 
 
 
 
 
299
 
300
  # -------------------------------------------------------------------------
301
  # Parse and return arguments
@@ -425,6 +435,7 @@ def run_extraction( # noqa: PLR0912, PLR0915
425
  force: bool,
426
  verbose: bool,
427
  quiet: bool,
 
428
  ) -> ExtractionStatistics:
429
  """Run the PDF extraction process on all PDF files in the input directory.
430
 
@@ -449,6 +460,8 @@ def run_extraction( # noqa: PLR0912, PLR0915
449
  If True, print detailed information including file names.
450
  quiet : bool
451
  If True, suppress progress bar (but still print summary).
 
 
452
 
453
  Returns:
454
  -------
@@ -545,10 +558,22 @@ def run_extraction( # noqa: PLR0912, PLR0915
545
  # -------------------------------------------------------------------------
546
  # Process each PDF file
547
  # -------------------------------------------------------------------------
 
 
 
 
548
  for idx, pdf_path in enumerate(progress_bar):
549
  # Determine output path
550
  md_filename = pdf_path.stem + ".md"
551
  md_path = output_dir / md_filename
 
 
 
 
 
 
 
 
552
 
553
  # Check if extraction is needed
554
  if not _should_extract(pdf_path, md_path, force):
@@ -572,7 +597,30 @@ def run_extraction( # noqa: PLR0912, PLR0915
572
 
573
  # Get markdown content and apply converter
574
  raw_markdown = document.to_markdown()
 
 
 
 
 
 
 
 
 
 
 
 
 
575
  clean_markdown = converter.convert(raw_markdown)
 
 
 
 
 
 
 
 
 
 
576
 
577
  # Write output file
578
  md_path.write_text(clean_markdown, encoding="utf-8")
@@ -701,6 +749,7 @@ def main(argv: list[str] | None = None) -> int: # noqa: PLR0911
701
  force=args.force,
702
  verbose=args.verbose,
703
  quiet=args.quiet,
 
704
  )
705
 
706
  # -------------------------------------------------------------------------
 
42
  from __future__ import annotations
43
 
44
  import argparse
45
+ import hashlib
46
  import sys
47
  import time
48
  from dataclasses import dataclass
 
217
  - force: bool - Whether to overwrite existing files
218
  - verbose: bool - Whether to show detailed output
219
  - quiet: bool - Whether to suppress output
220
+ - dump_raw_for: str | None - Optional PDF filename/stem to dump raw output
221
 
222
  Raises:
223
  ------
 
298
  default=False,
299
  help="Suppress all output except errors (still shows summary)",
300
  )
301
+ parser.add_argument(
302
+ "--dump-raw-for",
303
+ default=None,
304
+ help=(
305
+ "Dump raw pymupdf4llm markdown (pre-MarkdownConverter) for a "
306
+ "matching PDF filename or stem to <output_dir>/<stem>.raw.md"
307
+ ),
308
+ )
309
 
310
  # -------------------------------------------------------------------------
311
  # Parse and return arguments
 
435
  force: bool,
436
  verbose: bool,
437
  quiet: bool,
438
+ dump_raw_for: str | None = None,
439
  ) -> ExtractionStatistics:
440
  """Run the PDF extraction process on all PDF files in the input directory.
441
 
 
460
  If True, print detailed information including file names.
461
  quiet : bool
462
  If True, suppress progress bar (but still print summary).
463
+ dump_raw_for : str | None
464
+ Optional PDF filename or stem to dump raw markdown for.
465
 
466
  Returns:
467
  -------
 
558
  # -------------------------------------------------------------------------
559
  # Process each PDF file
560
  # -------------------------------------------------------------------------
561
+ raw_target = (dump_raw_for or "").strip()
562
+ raw_target_lower = raw_target.lower()
563
+ raw_target_is_filename = raw_target_lower.endswith(".pdf")
564
+
565
  for idx, pdf_path in enumerate(progress_bar):
566
  # Determine output path
567
  md_filename = pdf_path.stem + ".md"
568
  md_path = output_dir / md_filename
569
+ pdf_name_lower = pdf_path.name.lower()
570
+ pdf_stem_lower = pdf_path.stem.lower()
571
+ raw_dump_match = False
572
+ if raw_target_lower:
573
+ if raw_target_is_filename:
574
+ raw_dump_match = pdf_name_lower == raw_target_lower
575
+ else:
576
+ raw_dump_match = raw_target_lower in {pdf_name_lower, pdf_stem_lower}
577
 
578
  # Check if extraction is needed
579
  if not _should_extract(pdf_path, md_path, force):
 
597
 
598
  # Get markdown content and apply converter
599
  raw_markdown = document.to_markdown()
600
+ if raw_dump_match:
601
+ raw_dump_path = output_dir / f"{pdf_path.stem}.raw.md"
602
+ raw_dump_path.write_text(raw_markdown, encoding="utf-8")
603
+ if verbose:
604
+ print(f" Wrote raw markdown: {raw_dump_path}")
605
+ if verbose and raw_dump_match:
606
+ raw_hash = hashlib.sha256(raw_markdown.encode("utf-8")).hexdigest()[:12]
607
+ raw_underscore_count = raw_markdown.count("_")
608
+ print(
609
+ " Raw checksum/underscores:",
610
+ raw_hash,
611
+ f"underscores={raw_underscore_count}",
612
+ )
613
  clean_markdown = converter.convert(raw_markdown)
614
+ if verbose and raw_dump_match:
615
+ clean_hash = hashlib.sha256(clean_markdown.encode("utf-8")).hexdigest()[
616
+ :12
617
+ ]
618
+ clean_underscore_count = clean_markdown.count("_")
619
+ print(
620
+ " Clean checksum/underscores:",
621
+ clean_hash,
622
+ f"underscores={clean_underscore_count}",
623
+ )
624
 
625
  # Write output file
626
  md_path.write_text(clean_markdown, encoding="utf-8")
 
749
  force=args.force,
750
  verbose=args.verbose,
751
  quiet=args.quiet,
752
+ dump_raw_for=args.dump_raw_for,
753
  )
754
 
755
  # -------------------------------------------------------------------------
src/rag_chatbot/chunking/chunker.py CHANGED
@@ -402,8 +402,25 @@ class Chunker:
402
  sliding_chunker = self._get_sliding_chunker()
403
  chunks: list[Chunk] = []
404
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
405
  # Normalize the content text
406
- normalized_content = self._normalize_text(block.content)
407
 
408
  if not normalized_content.strip():
409
  return chunks, chunk_index_offset
@@ -435,6 +452,19 @@ class Chunker:
435
 
436
  return chunks, chunk_index_offset + len(chunks)
437
 
 
 
 
 
 
 
 
 
 
 
 
 
 
438
  def chunk_document(
439
  self,
440
  markdown: str,
 
402
  sliding_chunker = self._get_sliding_chunker()
403
  chunks: list[Chunk] = []
404
 
405
+ raw_content = block.content
406
+ if self._is_code_block(raw_content):
407
+ token_count = sliding_chunker.tokenizer.count_tokens(raw_content)
408
+ chunk_id = _generate_chunk_id(source, base_page, chunk_index_offset)
409
+ chunk = Chunk(
410
+ chunk_id=chunk_id,
411
+ text=raw_content,
412
+ heading_path=list(block.heading_path),
413
+ source=source,
414
+ page=base_page,
415
+ start_char=0,
416
+ end_char=len(raw_content),
417
+ token_count=token_count,
418
+ )
419
+ chunks.append(chunk)
420
+ return chunks, chunk_index_offset + 1
421
+
422
  # Normalize the content text
423
+ normalized_content = self._normalize_text(raw_content)
424
 
425
  if not normalized_content.strip():
426
  return chunks, chunk_index_offset
 
452
 
453
  return chunks, chunk_index_offset + len(chunks)
454
 
455
+ @staticmethod
456
+ def _is_code_block(content: str) -> bool:
457
+ """Return True when the content represents a standalone code block."""
458
+ if not content:
459
+ return False
460
+ stripped = content.lstrip()
461
+ if stripped.startswith("```") or stripped.startswith("~~~"):
462
+ return True
463
+ lines = [line for line in content.splitlines() if line.strip()]
464
+ if not lines:
465
+ return False
466
+ return all(line.startswith(" ") or line.startswith("\t") for line in lines)
467
+
468
  def chunk_document(
469
  self,
470
  markdown: str,
src/rag_chatbot/chunking/models.py CHANGED
@@ -161,7 +161,7 @@ THERMAL_COMFORT_TERMS: dict[str, str] = {
161
  "ther mal com fort": "thermal comfort",
162
  "ther mal": "thermal",
163
  "com fort": "comfort",
164
- # Common function names from the library
165
  "pmv_ppd": "pmv_ppd",
166
  "adaptive_ashrae": "adaptive_ashrae",
167
  "adaptive_en": "adaptive_en",
@@ -183,6 +183,66 @@ THERMAL_COMFORT_TERMS: dict[str, str] = {
183
  "two_nodes": "two_nodes",
184
  "solar_altitude": "solar_altitude",
185
  "mean_radiant_temperature": "mean_radiant_temperature",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
  }
187
 
188
  # Regex pattern for detecting ALL CAPS words (3+ consecutive capital letters)
@@ -230,8 +290,19 @@ _HTML_COMMENT_PATTERN: re.Pattern[str] = re.compile(r"<!--.*?-->", re.DOTALL)
230
 
231
  # Technical terms that should NOT be segmented
232
  # These are valid compound words or domain-specific terms
 
 
 
 
 
 
 
 
 
 
233
  _PROTECTED_TERMS: frozenset[str] = frozenset(
234
  {
 
235
  "pythermalcomfort",
236
  "thermalcomfort",
237
  "metabolicrate",
@@ -242,7 +313,60 @@ _PROTECTED_TERMS: frozenset[str] = frozenset(
242
  "physiological",
243
  "temperature",
244
  "temperatures",
245
- # Add more as needed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
246
  }
247
  )
248
 
@@ -600,6 +724,14 @@ class TextNormalizer:
600
  result_words.append(word)
601
  continue
602
 
 
 
 
 
 
 
 
 
603
  # Skip protected terms (case-insensitive)
604
  if stripped.lower() in _PROTECTED_TERMS:
605
  result_words.append(word)
 
161
  "ther mal com fort": "thermal comfort",
162
  "ther mal": "thermal",
163
  "com fort": "comfort",
164
+ # Common function names from the library (preserving underscores)
165
  "pmv_ppd": "pmv_ppd",
166
  "adaptive_ashrae": "adaptive_ashrae",
167
  "adaptive_en": "adaptive_en",
 
183
  "two_nodes": "two_nodes",
184
  "solar_altitude": "solar_altitude",
185
  "mean_radiant_temperature": "mean_radiant_temperature",
186
+ # =========================================================================
187
+ # pythermalcomfort function name corrections
188
+ # Maps concatenated versions (without underscores) to correct snake_case
189
+ # This handles cases where PDF extraction strips underscores from names.
190
+ # =========================================================================
191
+ # Models - PMV/PPD variants
192
+ "pmvppdashrae": "pmv_ppd_ashrae",
193
+ "pmvppdiso": "pmv_ppd_iso",
194
+ "pmvathb": "pmv_athb",
195
+ "pmva": "pmv_a",
196
+ "pmve": "pmv_e",
197
+ # Models - Adaptive comfort
198
+ "adaptiveashrae": "adaptive_ashrae",
199
+ "adaptiveen": "adaptive_en",
200
+ # Models - Two-node models
201
+ "twonodesgagge": "two_nodes_gagge",
202
+ "twonodesgaggesleep": "two_nodes_gagge_sleep",
203
+ "twonodesgaggeji": "two_nodes_gagge_ji",
204
+ # Models - Heat indices
205
+ "heatindexlu": "heat_index_lu",
206
+ "heatindexrothfusz": "heat_index_rothfusz",
207
+ "discomfortindex": "discomfort_index",
208
+ # Models - Other thermal indices
209
+ "petsteady": "pet_steady",
210
+ "settmp": "set_tmp",
211
+ "coolingeffect": "cooling_effect",
212
+ "solargain": "solar_gain",
213
+ "usefansheatwaves": "use_fans_heatwaves",
214
+ "verticaltmpgradppd": "vertical_tmp_grad_ppd",
215
+ "ankledraft": "ankle_draft",
216
+ "clotout": "clo_tout",
217
+ # Models - Work capacity
218
+ "workcapacitydunne": "work_capacity_dunne",
219
+ "workcapacityhothaps": "work_capacity_hothaps",
220
+ "workcapacityiso": "work_capacity_iso",
221
+ "workcapacityniosh": "work_capacity_niosh",
222
+ # Models - Wind chill
223
+ "windchilltemperature": "wind_chill_temperature",
224
+ # Utilities - Temperature and psychrometrics
225
+ "runningmeanoutdoortemperature": "running_mean_outdoor_temperature",
226
+ "meanradianttmp": "mean_radiant_tmp",
227
+ "operativetmp": "operative_tmp",
228
+ "dewpointtmp": "dew_point_tmp",
229
+ "wetbulbtmp": "wet_bulb_tmp",
230
+ "enthalpyair": "enthalpy_air",
231
+ "bodysurfacearea": "body_surface_area",
232
+ "psytarh": "psy_ta_rh",
233
+ "vrelative": "v_relative",
234
+ "unitsconverter": "units_converter",
235
+ # Utilities - Clothing functions
236
+ "clodynamicashrae": "clo_dynamic_ashrae",
237
+ "clodynamiciso": "clo_dynamic_iso",
238
+ "cloinsulationairlayer": "clo_insulation_air_layer",
239
+ "cloareafactor": "clo_area_factor",
240
+ "clocorrectionfactorenvironment": "clo_correction_factor_environment",
241
+ "clointrinsicinsulatioensemble": "clo_intrinsic_insulation_ensemble",
242
+ "clototalinsulation": "clo_total_insulation",
243
+ "clotypicalensembles": "clo_typical_ensembles",
244
+ "cloindividualgarments": "clo_individual_garments",
245
+ "mettypicaltasks": "met_typical_tasks",
246
  }
247
 
248
  # Regex pattern for detecting ALL CAPS words (3+ consecutive capital letters)
 
290
 
291
  # Technical terms that should NOT be segmented
292
  # These are valid compound words or domain-specific terms
293
+ #
294
+ # IMPORTANT: This list includes pythermalcomfort function names in their
295
+ # concatenated form (without underscores) because PDF extraction sometimes
296
+ # strips underscores. When a word like "pmvppdashrae" is encountered, it
297
+ # should NOT be segmented into "pmv ppd ashrae" - instead, it should be
298
+ # preserved so that downstream processing or the LLM can recognise it as
299
+ # a function name variant.
300
+ #
301
+ # The function names are extracted from:
302
+ # pythermalcomfort-readthedocs-io-en-latest.pdf (official documentation)
303
  _PROTECTED_TERMS: frozenset[str] = frozenset(
304
  {
305
+ # General technical terms
306
  "pythermalcomfort",
307
  "thermalcomfort",
308
  "metabolicrate",
 
313
  "physiological",
314
  "temperature",
315
  "temperatures",
316
+ # =====================================================================
317
+ # pythermalcomfort.models function names (concatenated, lowercase)
318
+ # These protect against incorrect segmentation of function names
319
+ # when underscores are stripped during PDF extraction.
320
+ # =====================================================================
321
+ "adaptiveashrae", # adaptive_ashrae
322
+ "adaptiveen", # adaptive_en
323
+ "ankledraft", # ankle_draft
324
+ "clotout", # clo_tout
325
+ "coolingeffect", # cooling_effect
326
+ "discomfortindex", # discomfort_index
327
+ "twonodesgagge", # two_nodes_gagge
328
+ "twonodesgaggesleep", # two_nodes_gagge_sleep
329
+ "twonodesgaggeji", # two_nodes_gagge_ji
330
+ "heatindexlu", # heat_index_lu
331
+ "heatindexrothfusz", # heat_index_rothfusz
332
+ "petsteady", # pet_steady
333
+ "pmvppdiso", # pmv_ppd_iso
334
+ "pmvppdashrae", # pmv_ppd_ashrae
335
+ "pmvathb", # pmv_athb
336
+ "solargain", # solar_gain
337
+ "settmp", # set_tmp
338
+ "usefansheatwaves", # use_fans_heatwaves
339
+ "verticaltmpgradppd", # vertical_tmp_grad_ppd
340
+ "windchilltemperature", # wind_chill_temperature
341
+ "workcapacitydunne", # work_capacity_dunne
342
+ "workcapacityhothaps", # work_capacity_hothaps
343
+ "workcapacityiso", # work_capacity_iso
344
+ "workcapacityniosh", # work_capacity_niosh
345
+ # =====================================================================
346
+ # pythermalcomfort.utilities function names (concatenated, lowercase)
347
+ # =====================================================================
348
+ "runningmeanoutdoortemperature", # running_mean_outdoor_temperature
349
+ "vrelative", # v_relative
350
+ "clodynamicashrae", # clo_dynamic_ashrae
351
+ "clodynamiciso", # clo_dynamic_iso
352
+ "bodysurfacearea", # body_surface_area
353
+ "dewpointtmp", # dew_point_tmp
354
+ "enthalpyair", # enthalpy_air
355
+ "meanradianttmp", # mean_radiant_tmp
356
+ "operativetmp", # operative_tmp
357
+ "psytarh", # psy_ta_rh
358
+ "psat", # p_sat
359
+ "fsvv", # f_svv
360
+ "unitsconverter", # units_converter
361
+ "wetbulbtmp", # wet_bulb_tmp
362
+ "cloinsulationairlayer", # clo_insulation_air_layer
363
+ "cloareafactor", # clo_area_factor
364
+ "clocorrectionfactorenvironment", # clo_correction_factor_environment
365
+ "clointrinsicinsulatioensemble", # clo_intrinsic_insulation_ensemble
366
+ "clototalinsulation", # clo_total_insulation
367
+ "clotypicalensembles", # clo_typical_ensembles
368
+ "cloindividualgarments", # clo_individual_garments
369
+ "mettypicaltasks", # met_typical_tasks
370
  }
371
  )
372
 
 
724
  result_words.append(word)
725
  continue
726
 
727
+ # Skip words containing underscores - these are Python identifiers
728
+ # (e.g., pmv_ppd_ashrae, clo_dynamic_iso) that should be preserved
729
+ # exactly as-is. Underscores in function names are intentional and
730
+ # segmenting them would corrupt the identifier.
731
+ if "_" in stripped:
732
+ result_words.append(word)
733
+ continue
734
+
735
  # Skip protected terms (case-insensitive)
736
  if stripped.lower() in _PROTECTED_TERMS:
737
  result_words.append(word)
src/rag_chatbot/llm/prompts.py CHANGED
@@ -163,6 +163,22 @@ ISO 7243, ISO 7933
163
  - The pythermalcomfort Python library: available functions, required \
164
  parameters, return values, and practical usage examples
165
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
  **Response Guidelines:**
167
  1. ONLY answer questions using information from the provided context. If the \
168
  context does not contain enough information to fully answer a question, \
@@ -189,14 +205,20 @@ cited sources in numbered order.
189
  3. For code examples, use accurate pythermalcomfort library syntax. Always \
190
  include necessary imports and realistic parameter values with proper units.
191
 
192
- 4. Be precise with technical terminology and units:
 
 
 
 
 
 
193
  - Temperature: °C (degrees Celsius)
194
  - Air velocity: m/s (meters per second)
195
  - Metabolic rate: met (1 met = 58.2 W/m²)
196
  - Clothing insulation: clo (1 clo = 0.155 m²·K/W)
197
  - Relative humidity: % (percentage)
198
 
199
- 5. For mathematical formulas and equations, you MUST use LaTeX syntax with \
200
  dollar sign delimiters:
201
  - Inline math: Use single dollar signs for ANY math expression, including \
202
  subscripts, superscripts, and variables. Examples:
@@ -210,11 +232,11 @@ equations, e.g.,
210
  dollar signs - this will not render. ALWAYS wrap math in $...$ or $$...$$
211
  - Do NOT use \\[...\\] or \\(...\\) delimiters - only use $ and $$ delimiters.
212
 
213
- 6. If a question is ambiguous, ask for clarification about the specific \
214
  standard, thermal comfort model, environmental conditions, or use case the \
215
  user is interested in.
216
 
217
- 7. When explaining thermal comfort concepts, provide context about why they \
218
  matter and how they relate to human comfort, health, and building design.
219
 
220
  **Response Formatting:**
 
163
  - The pythermalcomfort Python library: available functions, required \
164
  parameters, return values, and practical usage examples
165
 
166
+ **Important Limitations:**
167
+ You are a documentation assistant, NOT a calculator or code execution environment. \
168
+ You CANNOT:
169
+ - Execute code or perform calculations directly
170
+ - Run the pythermalcomfort library functions
171
+ - Generate or visualise results from calculations
172
+
173
+ You CAN:
174
+ - Explain thermal comfort concepts and models
175
+ - Provide ready-to-run Python code snippets that users can copy and execute themselves
176
+ - Answer questions about function parameters, return values, and usage patterns
177
+
178
+ When asked "what can you do", describe yourself as a documentation assistant that \
179
+ explains concepts and provides code examples - never suggest you can perform \
180
+ calculations or execute code.
181
+
182
  **Response Guidelines:**
183
  1. ONLY answer questions using information from the provided context. If the \
184
  context does not contain enough information to fully answer a question, \
 
205
  3. For code examples, use accurate pythermalcomfort library syntax. Always \
206
  include necessary imports and realistic parameter values with proper units.
207
 
208
+ 4. **Function Naming Convention:**
209
+ pythermalcomfort uses **snake_case** (lowercase with underscores) for ALL \
210
+ function names. Use exact names from the retrieved context (e.g., `pmv_ppd_ashrae()`, \
211
+ `adaptive_en()`, `two_nodes_gagge()`). Standard parameter names: `tdb`, `tr`, `v`, \
212
+ `vr`, `rh`, `met`, `clo`, `wme`.
213
+
214
+ 5. Be precise with technical terminology and units:
215
  - Temperature: °C (degrees Celsius)
216
  - Air velocity: m/s (meters per second)
217
  - Metabolic rate: met (1 met = 58.2 W/m²)
218
  - Clothing insulation: clo (1 clo = 0.155 m²·K/W)
219
  - Relative humidity: % (percentage)
220
 
221
+ 6. For mathematical formulas and equations, you MUST use LaTeX syntax with \
222
  dollar sign delimiters:
223
  - Inline math: Use single dollar signs for ANY math expression, including \
224
  subscripts, superscripts, and variables. Examples:
 
232
  dollar signs - this will not render. ALWAYS wrap math in $...$ or $$...$$
233
  - Do NOT use \\[...\\] or \\(...\\) delimiters - only use $ and $$ delimiters.
234
 
235
+ 7. If a question is ambiguous, ask for clarification about the specific \
236
  standard, thermal comfort model, environmental conditions, or use case the \
237
  user is interested in.
238
 
239
+ 8. When explaining thermal comfort concepts, provide context about why they \
240
  matter and how they relate to human comfort, health, and building design.
241
 
242
  **Response Formatting:**