Spaces:
Running
Running
feat: enhance chunking and extraction processes with debug artifact handling and function name corrections
Browse files- .gitignore +3 -0
- scripts/chunk.py +17 -2
- scripts/extract.py +49 -0
- src/rag_chatbot/chunking/chunker.py +31 -1
- src/rag_chatbot/chunking/models.py +134 -2
- src/rag_chatbot/llm/prompts.py +26 -4
.gitignore
CHANGED
|
@@ -165,3 +165,6 @@ tests/
|
|
| 165 |
!.pre-commit-config.yaml
|
| 166 |
!.env.example
|
| 167 |
!.dockerignore
|
|
|
|
|
|
|
|
|
|
|
|
| 165 |
!.pre-commit-config.yaml
|
| 166 |
!.env.example
|
| 167 |
!.dockerignore
|
| 168 |
+
|
| 169 |
+
# Local commands/notes
|
| 170 |
+
commands.md
|
scripts/chunk.py
CHANGED
|
@@ -385,6 +385,11 @@ def _get_chunker() -> Chunker:
|
|
| 385 |
# =============================================================================
|
| 386 |
|
| 387 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 388 |
def _compute_file_hash(path: Path) -> str:
|
| 389 |
"""Compute SHA-256 hash of a file's contents.
|
| 390 |
|
|
@@ -661,7 +666,11 @@ def run_chunking( # noqa: PLR0912, PLR0915
|
|
| 661 |
# -------------------------------------------------------------------------
|
| 662 |
# Find all Markdown files in input directory (non-recursive, top-level only)
|
| 663 |
# -------------------------------------------------------------------------
|
| 664 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 665 |
total_files = len(md_files)
|
| 666 |
|
| 667 |
# -------------------------------------------------------------------------
|
|
@@ -931,7 +940,13 @@ def main(argv: list[str] | None = None) -> int: # noqa: PLR0911
|
|
| 931 |
# -------------------------------------------------------------------------
|
| 932 |
# Check for Markdown files
|
| 933 |
# -------------------------------------------------------------------------
|
| 934 |
-
md_files =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 935 |
if not md_files:
|
| 936 |
print(
|
| 937 |
f"Error: No markdown files found in {args.input_dir}",
|
|
|
|
| 385 |
# =============================================================================
|
| 386 |
|
| 387 |
|
| 388 |
+
def _is_debug_artifact(path: Path) -> bool:
|
| 389 |
+
"""Return True if the path is a debug artifact that should be excluded."""
|
| 390 |
+
return path.name.endswith(".raw.md")
|
| 391 |
+
|
| 392 |
+
|
| 393 |
def _compute_file_hash(path: Path) -> str:
|
| 394 |
"""Compute SHA-256 hash of a file's contents.
|
| 395 |
|
|
|
|
| 666 |
# -------------------------------------------------------------------------
|
| 667 |
# Find all Markdown files in input directory (non-recursive, top-level only)
|
| 668 |
# -------------------------------------------------------------------------
|
| 669 |
+
all_md_files = sorted([f for f in input_dir.glob("*.md") if f.is_file()])
|
| 670 |
+
md_files = [f for f in all_md_files if not _is_debug_artifact(f)]
|
| 671 |
+
skipped_debug = len(all_md_files) - len(md_files)
|
| 672 |
+
if skipped_debug > 0 and verbose and not quiet:
|
| 673 |
+
print(f"Skipping {skipped_debug} debug artifact(s) (*.raw.md)")
|
| 674 |
total_files = len(md_files)
|
| 675 |
|
| 676 |
# -------------------------------------------------------------------------
|
|
|
|
| 940 |
# -------------------------------------------------------------------------
|
| 941 |
# Check for Markdown files
|
| 942 |
# -------------------------------------------------------------------------
|
| 943 |
+
md_files = [f for f in args.input_dir.glob("*.md") if not _is_debug_artifact(f)]
|
| 944 |
+
if args.verbose and args.quiet:
|
| 945 |
+
pass
|
| 946 |
+
elif args.verbose:
|
| 947 |
+
skipped_debug = len(list(args.input_dir.glob("*.md"))) - len(md_files)
|
| 948 |
+
if skipped_debug > 0:
|
| 949 |
+
print(f"Skipping {skipped_debug} debug artifact(s) (*.raw.md)")
|
| 950 |
if not md_files:
|
| 951 |
print(
|
| 952 |
f"Error: No markdown files found in {args.input_dir}",
|
scripts/extract.py
CHANGED
|
@@ -42,6 +42,7 @@ Note:
|
|
| 42 |
from __future__ import annotations
|
| 43 |
|
| 44 |
import argparse
|
|
|
|
| 45 |
import sys
|
| 46 |
import time
|
| 47 |
from dataclasses import dataclass
|
|
@@ -216,6 +217,7 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
|
|
| 216 |
- force: bool - Whether to overwrite existing files
|
| 217 |
- verbose: bool - Whether to show detailed output
|
| 218 |
- quiet: bool - Whether to suppress output
|
|
|
|
| 219 |
|
| 220 |
Raises:
|
| 221 |
------
|
|
@@ -296,6 +298,14 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
|
|
| 296 |
default=False,
|
| 297 |
help="Suppress all output except errors (still shows summary)",
|
| 298 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 299 |
|
| 300 |
# -------------------------------------------------------------------------
|
| 301 |
# Parse and return arguments
|
|
@@ -425,6 +435,7 @@ def run_extraction( # noqa: PLR0912, PLR0915
|
|
| 425 |
force: bool,
|
| 426 |
verbose: bool,
|
| 427 |
quiet: bool,
|
|
|
|
| 428 |
) -> ExtractionStatistics:
|
| 429 |
"""Run the PDF extraction process on all PDF files in the input directory.
|
| 430 |
|
|
@@ -449,6 +460,8 @@ def run_extraction( # noqa: PLR0912, PLR0915
|
|
| 449 |
If True, print detailed information including file names.
|
| 450 |
quiet : bool
|
| 451 |
If True, suppress progress bar (but still print summary).
|
|
|
|
|
|
|
| 452 |
|
| 453 |
Returns:
|
| 454 |
-------
|
|
@@ -545,10 +558,22 @@ def run_extraction( # noqa: PLR0912, PLR0915
|
|
| 545 |
# -------------------------------------------------------------------------
|
| 546 |
# Process each PDF file
|
| 547 |
# -------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
| 548 |
for idx, pdf_path in enumerate(progress_bar):
|
| 549 |
# Determine output path
|
| 550 |
md_filename = pdf_path.stem + ".md"
|
| 551 |
md_path = output_dir / md_filename
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 552 |
|
| 553 |
# Check if extraction is needed
|
| 554 |
if not _should_extract(pdf_path, md_path, force):
|
|
@@ -572,7 +597,30 @@ def run_extraction( # noqa: PLR0912, PLR0915
|
|
| 572 |
|
| 573 |
# Get markdown content and apply converter
|
| 574 |
raw_markdown = document.to_markdown()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 575 |
clean_markdown = converter.convert(raw_markdown)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 576 |
|
| 577 |
# Write output file
|
| 578 |
md_path.write_text(clean_markdown, encoding="utf-8")
|
|
@@ -701,6 +749,7 @@ def main(argv: list[str] | None = None) -> int: # noqa: PLR0911
|
|
| 701 |
force=args.force,
|
| 702 |
verbose=args.verbose,
|
| 703 |
quiet=args.quiet,
|
|
|
|
| 704 |
)
|
| 705 |
|
| 706 |
# -------------------------------------------------------------------------
|
|
|
|
| 42 |
from __future__ import annotations
|
| 43 |
|
| 44 |
import argparse
|
| 45 |
+
import hashlib
|
| 46 |
import sys
|
| 47 |
import time
|
| 48 |
from dataclasses import dataclass
|
|
|
|
| 217 |
- force: bool - Whether to overwrite existing files
|
| 218 |
- verbose: bool - Whether to show detailed output
|
| 219 |
- quiet: bool - Whether to suppress output
|
| 220 |
+
- dump_raw_for: str | None - Optional PDF filename/stem to dump raw output
|
| 221 |
|
| 222 |
Raises:
|
| 223 |
------
|
|
|
|
| 298 |
default=False,
|
| 299 |
help="Suppress all output except errors (still shows summary)",
|
| 300 |
)
|
| 301 |
+
parser.add_argument(
|
| 302 |
+
"--dump-raw-for",
|
| 303 |
+
default=None,
|
| 304 |
+
help=(
|
| 305 |
+
"Dump raw pymupdf4llm markdown (pre-MarkdownConverter) for a "
|
| 306 |
+
"matching PDF filename or stem to <output_dir>/<stem>.raw.md"
|
| 307 |
+
),
|
| 308 |
+
)
|
| 309 |
|
| 310 |
# -------------------------------------------------------------------------
|
| 311 |
# Parse and return arguments
|
|
|
|
| 435 |
force: bool,
|
| 436 |
verbose: bool,
|
| 437 |
quiet: bool,
|
| 438 |
+
dump_raw_for: str | None = None,
|
| 439 |
) -> ExtractionStatistics:
|
| 440 |
"""Run the PDF extraction process on all PDF files in the input directory.
|
| 441 |
|
|
|
|
| 460 |
If True, print detailed information including file names.
|
| 461 |
quiet : bool
|
| 462 |
If True, suppress progress bar (but still print summary).
|
| 463 |
+
dump_raw_for : str | None
|
| 464 |
+
Optional PDF filename or stem to dump raw markdown for.
|
| 465 |
|
| 466 |
Returns:
|
| 467 |
-------
|
|
|
|
| 558 |
# -------------------------------------------------------------------------
|
| 559 |
# Process each PDF file
|
| 560 |
# -------------------------------------------------------------------------
|
| 561 |
+
raw_target = (dump_raw_for or "").strip()
|
| 562 |
+
raw_target_lower = raw_target.lower()
|
| 563 |
+
raw_target_is_filename = raw_target_lower.endswith(".pdf")
|
| 564 |
+
|
| 565 |
for idx, pdf_path in enumerate(progress_bar):
|
| 566 |
# Determine output path
|
| 567 |
md_filename = pdf_path.stem + ".md"
|
| 568 |
md_path = output_dir / md_filename
|
| 569 |
+
pdf_name_lower = pdf_path.name.lower()
|
| 570 |
+
pdf_stem_lower = pdf_path.stem.lower()
|
| 571 |
+
raw_dump_match = False
|
| 572 |
+
if raw_target_lower:
|
| 573 |
+
if raw_target_is_filename:
|
| 574 |
+
raw_dump_match = pdf_name_lower == raw_target_lower
|
| 575 |
+
else:
|
| 576 |
+
raw_dump_match = raw_target_lower in {pdf_name_lower, pdf_stem_lower}
|
| 577 |
|
| 578 |
# Check if extraction is needed
|
| 579 |
if not _should_extract(pdf_path, md_path, force):
|
|
|
|
| 597 |
|
| 598 |
# Get markdown content and apply converter
|
| 599 |
raw_markdown = document.to_markdown()
|
| 600 |
+
if raw_dump_match:
|
| 601 |
+
raw_dump_path = output_dir / f"{pdf_path.stem}.raw.md"
|
| 602 |
+
raw_dump_path.write_text(raw_markdown, encoding="utf-8")
|
| 603 |
+
if verbose:
|
| 604 |
+
print(f" Wrote raw markdown: {raw_dump_path}")
|
| 605 |
+
if verbose and raw_dump_match:
|
| 606 |
+
raw_hash = hashlib.sha256(raw_markdown.encode("utf-8")).hexdigest()[:12]
|
| 607 |
+
raw_underscore_count = raw_markdown.count("_")
|
| 608 |
+
print(
|
| 609 |
+
" Raw checksum/underscores:",
|
| 610 |
+
raw_hash,
|
| 611 |
+
f"underscores={raw_underscore_count}",
|
| 612 |
+
)
|
| 613 |
clean_markdown = converter.convert(raw_markdown)
|
| 614 |
+
if verbose and raw_dump_match:
|
| 615 |
+
clean_hash = hashlib.sha256(clean_markdown.encode("utf-8")).hexdigest()[
|
| 616 |
+
:12
|
| 617 |
+
]
|
| 618 |
+
clean_underscore_count = clean_markdown.count("_")
|
| 619 |
+
print(
|
| 620 |
+
" Clean checksum/underscores:",
|
| 621 |
+
clean_hash,
|
| 622 |
+
f"underscores={clean_underscore_count}",
|
| 623 |
+
)
|
| 624 |
|
| 625 |
# Write output file
|
| 626 |
md_path.write_text(clean_markdown, encoding="utf-8")
|
|
|
|
| 749 |
force=args.force,
|
| 750 |
verbose=args.verbose,
|
| 751 |
quiet=args.quiet,
|
| 752 |
+
dump_raw_for=args.dump_raw_for,
|
| 753 |
)
|
| 754 |
|
| 755 |
# -------------------------------------------------------------------------
|
src/rag_chatbot/chunking/chunker.py
CHANGED
|
@@ -402,8 +402,25 @@ class Chunker:
|
|
| 402 |
sliding_chunker = self._get_sliding_chunker()
|
| 403 |
chunks: list[Chunk] = []
|
| 404 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 405 |
# Normalize the content text
|
| 406 |
-
normalized_content = self._normalize_text(
|
| 407 |
|
| 408 |
if not normalized_content.strip():
|
| 409 |
return chunks, chunk_index_offset
|
|
@@ -435,6 +452,19 @@ class Chunker:
|
|
| 435 |
|
| 436 |
return chunks, chunk_index_offset + len(chunks)
|
| 437 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 438 |
def chunk_document(
|
| 439 |
self,
|
| 440 |
markdown: str,
|
|
|
|
| 402 |
sliding_chunker = self._get_sliding_chunker()
|
| 403 |
chunks: list[Chunk] = []
|
| 404 |
|
| 405 |
+
raw_content = block.content
|
| 406 |
+
if self._is_code_block(raw_content):
|
| 407 |
+
token_count = sliding_chunker.tokenizer.count_tokens(raw_content)
|
| 408 |
+
chunk_id = _generate_chunk_id(source, base_page, chunk_index_offset)
|
| 409 |
+
chunk = Chunk(
|
| 410 |
+
chunk_id=chunk_id,
|
| 411 |
+
text=raw_content,
|
| 412 |
+
heading_path=list(block.heading_path),
|
| 413 |
+
source=source,
|
| 414 |
+
page=base_page,
|
| 415 |
+
start_char=0,
|
| 416 |
+
end_char=len(raw_content),
|
| 417 |
+
token_count=token_count,
|
| 418 |
+
)
|
| 419 |
+
chunks.append(chunk)
|
| 420 |
+
return chunks, chunk_index_offset + 1
|
| 421 |
+
|
| 422 |
# Normalize the content text
|
| 423 |
+
normalized_content = self._normalize_text(raw_content)
|
| 424 |
|
| 425 |
if not normalized_content.strip():
|
| 426 |
return chunks, chunk_index_offset
|
|
|
|
| 452 |
|
| 453 |
return chunks, chunk_index_offset + len(chunks)
|
| 454 |
|
| 455 |
+
@staticmethod
|
| 456 |
+
def _is_code_block(content: str) -> bool:
|
| 457 |
+
"""Return True when the content represents a standalone code block."""
|
| 458 |
+
if not content:
|
| 459 |
+
return False
|
| 460 |
+
stripped = content.lstrip()
|
| 461 |
+
if stripped.startswith("```") or stripped.startswith("~~~"):
|
| 462 |
+
return True
|
| 463 |
+
lines = [line for line in content.splitlines() if line.strip()]
|
| 464 |
+
if not lines:
|
| 465 |
+
return False
|
| 466 |
+
return all(line.startswith(" ") or line.startswith("\t") for line in lines)
|
| 467 |
+
|
| 468 |
def chunk_document(
|
| 469 |
self,
|
| 470 |
markdown: str,
|
src/rag_chatbot/chunking/models.py
CHANGED
|
@@ -161,7 +161,7 @@ THERMAL_COMFORT_TERMS: dict[str, str] = {
|
|
| 161 |
"ther mal com fort": "thermal comfort",
|
| 162 |
"ther mal": "thermal",
|
| 163 |
"com fort": "comfort",
|
| 164 |
-
# Common function names from the library
|
| 165 |
"pmv_ppd": "pmv_ppd",
|
| 166 |
"adaptive_ashrae": "adaptive_ashrae",
|
| 167 |
"adaptive_en": "adaptive_en",
|
|
@@ -183,6 +183,66 @@ THERMAL_COMFORT_TERMS: dict[str, str] = {
|
|
| 183 |
"two_nodes": "two_nodes",
|
| 184 |
"solar_altitude": "solar_altitude",
|
| 185 |
"mean_radiant_temperature": "mean_radiant_temperature",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
}
|
| 187 |
|
| 188 |
# Regex pattern for detecting ALL CAPS words (3+ consecutive capital letters)
|
|
@@ -230,8 +290,19 @@ _HTML_COMMENT_PATTERN: re.Pattern[str] = re.compile(r"<!--.*?-->", re.DOTALL)
|
|
| 230 |
|
| 231 |
# Technical terms that should NOT be segmented
|
| 232 |
# These are valid compound words or domain-specific terms
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 233 |
_PROTECTED_TERMS: frozenset[str] = frozenset(
|
| 234 |
{
|
|
|
|
| 235 |
"pythermalcomfort",
|
| 236 |
"thermalcomfort",
|
| 237 |
"metabolicrate",
|
|
@@ -242,7 +313,60 @@ _PROTECTED_TERMS: frozenset[str] = frozenset(
|
|
| 242 |
"physiological",
|
| 243 |
"temperature",
|
| 244 |
"temperatures",
|
| 245 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 246 |
}
|
| 247 |
)
|
| 248 |
|
|
@@ -600,6 +724,14 @@ class TextNormalizer:
|
|
| 600 |
result_words.append(word)
|
| 601 |
continue
|
| 602 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 603 |
# Skip protected terms (case-insensitive)
|
| 604 |
if stripped.lower() in _PROTECTED_TERMS:
|
| 605 |
result_words.append(word)
|
|
|
|
| 161 |
"ther mal com fort": "thermal comfort",
|
| 162 |
"ther mal": "thermal",
|
| 163 |
"com fort": "comfort",
|
| 164 |
+
# Common function names from the library (preserving underscores)
|
| 165 |
"pmv_ppd": "pmv_ppd",
|
| 166 |
"adaptive_ashrae": "adaptive_ashrae",
|
| 167 |
"adaptive_en": "adaptive_en",
|
|
|
|
| 183 |
"two_nodes": "two_nodes",
|
| 184 |
"solar_altitude": "solar_altitude",
|
| 185 |
"mean_radiant_temperature": "mean_radiant_temperature",
|
| 186 |
+
# =========================================================================
|
| 187 |
+
# pythermalcomfort function name corrections
|
| 188 |
+
# Maps concatenated versions (without underscores) to correct snake_case
|
| 189 |
+
# This handles cases where PDF extraction strips underscores from names.
|
| 190 |
+
# =========================================================================
|
| 191 |
+
# Models - PMV/PPD variants
|
| 192 |
+
"pmvppdashrae": "pmv_ppd_ashrae",
|
| 193 |
+
"pmvppdiso": "pmv_ppd_iso",
|
| 194 |
+
"pmvathb": "pmv_athb",
|
| 195 |
+
"pmva": "pmv_a",
|
| 196 |
+
"pmve": "pmv_e",
|
| 197 |
+
# Models - Adaptive comfort
|
| 198 |
+
"adaptiveashrae": "adaptive_ashrae",
|
| 199 |
+
"adaptiveen": "adaptive_en",
|
| 200 |
+
# Models - Two-node models
|
| 201 |
+
"twonodesgagge": "two_nodes_gagge",
|
| 202 |
+
"twonodesgaggesleep": "two_nodes_gagge_sleep",
|
| 203 |
+
"twonodesgaggeji": "two_nodes_gagge_ji",
|
| 204 |
+
# Models - Heat indices
|
| 205 |
+
"heatindexlu": "heat_index_lu",
|
| 206 |
+
"heatindexrothfusz": "heat_index_rothfusz",
|
| 207 |
+
"discomfortindex": "discomfort_index",
|
| 208 |
+
# Models - Other thermal indices
|
| 209 |
+
"petsteady": "pet_steady",
|
| 210 |
+
"settmp": "set_tmp",
|
| 211 |
+
"coolingeffect": "cooling_effect",
|
| 212 |
+
"solargain": "solar_gain",
|
| 213 |
+
"usefansheatwaves": "use_fans_heatwaves",
|
| 214 |
+
"verticaltmpgradppd": "vertical_tmp_grad_ppd",
|
| 215 |
+
"ankledraft": "ankle_draft",
|
| 216 |
+
"clotout": "clo_tout",
|
| 217 |
+
# Models - Work capacity
|
| 218 |
+
"workcapacitydunne": "work_capacity_dunne",
|
| 219 |
+
"workcapacityhothaps": "work_capacity_hothaps",
|
| 220 |
+
"workcapacityiso": "work_capacity_iso",
|
| 221 |
+
"workcapacityniosh": "work_capacity_niosh",
|
| 222 |
+
# Models - Wind chill
|
| 223 |
+
"windchilltemperature": "wind_chill_temperature",
|
| 224 |
+
# Utilities - Temperature and psychrometrics
|
| 225 |
+
"runningmeanoutdoortemperature": "running_mean_outdoor_temperature",
|
| 226 |
+
"meanradianttmp": "mean_radiant_tmp",
|
| 227 |
+
"operativetmp": "operative_tmp",
|
| 228 |
+
"dewpointtmp": "dew_point_tmp",
|
| 229 |
+
"wetbulbtmp": "wet_bulb_tmp",
|
| 230 |
+
"enthalpyair": "enthalpy_air",
|
| 231 |
+
"bodysurfacearea": "body_surface_area",
|
| 232 |
+
"psytarh": "psy_ta_rh",
|
| 233 |
+
"vrelative": "v_relative",
|
| 234 |
+
"unitsconverter": "units_converter",
|
| 235 |
+
# Utilities - Clothing functions
|
| 236 |
+
"clodynamicashrae": "clo_dynamic_ashrae",
|
| 237 |
+
"clodynamiciso": "clo_dynamic_iso",
|
| 238 |
+
"cloinsulationairlayer": "clo_insulation_air_layer",
|
| 239 |
+
"cloareafactor": "clo_area_factor",
|
| 240 |
+
"clocorrectionfactorenvironment": "clo_correction_factor_environment",
|
| 241 |
+
"clointrinsicinsulatioensemble": "clo_intrinsic_insulation_ensemble",
|
| 242 |
+
"clototalinsulation": "clo_total_insulation",
|
| 243 |
+
"clotypicalensembles": "clo_typical_ensembles",
|
| 244 |
+
"cloindividualgarments": "clo_individual_garments",
|
| 245 |
+
"mettypicaltasks": "met_typical_tasks",
|
| 246 |
}
|
| 247 |
|
| 248 |
# Regex pattern for detecting ALL CAPS words (3+ consecutive capital letters)
|
|
|
|
| 290 |
|
| 291 |
# Technical terms that should NOT be segmented
|
| 292 |
# These are valid compound words or domain-specific terms
|
| 293 |
+
#
|
| 294 |
+
# IMPORTANT: This list includes pythermalcomfort function names in their
|
| 295 |
+
# concatenated form (without underscores) because PDF extraction sometimes
|
| 296 |
+
# strips underscores. When a word like "pmvppdashrae" is encountered, it
|
| 297 |
+
# should NOT be segmented into "pmv ppd ashrae" - instead, it should be
|
| 298 |
+
# preserved so that downstream processing or the LLM can recognise it as
|
| 299 |
+
# a function name variant.
|
| 300 |
+
#
|
| 301 |
+
# The function names are extracted from:
|
| 302 |
+
# pythermalcomfort-readthedocs-io-en-latest.pdf (official documentation)
|
| 303 |
_PROTECTED_TERMS: frozenset[str] = frozenset(
|
| 304 |
{
|
| 305 |
+
# General technical terms
|
| 306 |
"pythermalcomfort",
|
| 307 |
"thermalcomfort",
|
| 308 |
"metabolicrate",
|
|
|
|
| 313 |
"physiological",
|
| 314 |
"temperature",
|
| 315 |
"temperatures",
|
| 316 |
+
# =====================================================================
|
| 317 |
+
# pythermalcomfort.models function names (concatenated, lowercase)
|
| 318 |
+
# These protect against incorrect segmentation of function names
|
| 319 |
+
# when underscores are stripped during PDF extraction.
|
| 320 |
+
# =====================================================================
|
| 321 |
+
"adaptiveashrae", # adaptive_ashrae
|
| 322 |
+
"adaptiveen", # adaptive_en
|
| 323 |
+
"ankledraft", # ankle_draft
|
| 324 |
+
"clotout", # clo_tout
|
| 325 |
+
"coolingeffect", # cooling_effect
|
| 326 |
+
"discomfortindex", # discomfort_index
|
| 327 |
+
"twonodesgagge", # two_nodes_gagge
|
| 328 |
+
"twonodesgaggesleep", # two_nodes_gagge_sleep
|
| 329 |
+
"twonodesgaggeji", # two_nodes_gagge_ji
|
| 330 |
+
"heatindexlu", # heat_index_lu
|
| 331 |
+
"heatindexrothfusz", # heat_index_rothfusz
|
| 332 |
+
"petsteady", # pet_steady
|
| 333 |
+
"pmvppdiso", # pmv_ppd_iso
|
| 334 |
+
"pmvppdashrae", # pmv_ppd_ashrae
|
| 335 |
+
"pmvathb", # pmv_athb
|
| 336 |
+
"solargain", # solar_gain
|
| 337 |
+
"settmp", # set_tmp
|
| 338 |
+
"usefansheatwaves", # use_fans_heatwaves
|
| 339 |
+
"verticaltmpgradppd", # vertical_tmp_grad_ppd
|
| 340 |
+
"windchilltemperature", # wind_chill_temperature
|
| 341 |
+
"workcapacitydunne", # work_capacity_dunne
|
| 342 |
+
"workcapacityhothaps", # work_capacity_hothaps
|
| 343 |
+
"workcapacityiso", # work_capacity_iso
|
| 344 |
+
"workcapacityniosh", # work_capacity_niosh
|
| 345 |
+
# =====================================================================
|
| 346 |
+
# pythermalcomfort.utilities function names (concatenated, lowercase)
|
| 347 |
+
# =====================================================================
|
| 348 |
+
"runningmeanoutdoortemperature", # running_mean_outdoor_temperature
|
| 349 |
+
"vrelative", # v_relative
|
| 350 |
+
"clodynamicashrae", # clo_dynamic_ashrae
|
| 351 |
+
"clodynamiciso", # clo_dynamic_iso
|
| 352 |
+
"bodysurfacearea", # body_surface_area
|
| 353 |
+
"dewpointtmp", # dew_point_tmp
|
| 354 |
+
"enthalpyair", # enthalpy_air
|
| 355 |
+
"meanradianttmp", # mean_radiant_tmp
|
| 356 |
+
"operativetmp", # operative_tmp
|
| 357 |
+
"psytarh", # psy_ta_rh
|
| 358 |
+
"psat", # p_sat
|
| 359 |
+
"fsvv", # f_svv
|
| 360 |
+
"unitsconverter", # units_converter
|
| 361 |
+
"wetbulbtmp", # wet_bulb_tmp
|
| 362 |
+
"cloinsulationairlayer", # clo_insulation_air_layer
|
| 363 |
+
"cloareafactor", # clo_area_factor
|
| 364 |
+
"clocorrectionfactorenvironment", # clo_correction_factor_environment
|
| 365 |
+
"clointrinsicinsulatioensemble", # clo_intrinsic_insulation_ensemble
|
| 366 |
+
"clototalinsulation", # clo_total_insulation
|
| 367 |
+
"clotypicalensembles", # clo_typical_ensembles
|
| 368 |
+
"cloindividualgarments", # clo_individual_garments
|
| 369 |
+
"mettypicaltasks", # met_typical_tasks
|
| 370 |
}
|
| 371 |
)
|
| 372 |
|
|
|
|
| 724 |
result_words.append(word)
|
| 725 |
continue
|
| 726 |
|
| 727 |
+
# Skip words containing underscores - these are Python identifiers
|
| 728 |
+
# (e.g., pmv_ppd_ashrae, clo_dynamic_iso) that should be preserved
|
| 729 |
+
# exactly as-is. Underscores in function names are intentional and
|
| 730 |
+
# segmenting them would corrupt the identifier.
|
| 731 |
+
if "_" in stripped:
|
| 732 |
+
result_words.append(word)
|
| 733 |
+
continue
|
| 734 |
+
|
| 735 |
# Skip protected terms (case-insensitive)
|
| 736 |
if stripped.lower() in _PROTECTED_TERMS:
|
| 737 |
result_words.append(word)
|
src/rag_chatbot/llm/prompts.py
CHANGED
|
@@ -163,6 +163,22 @@ ISO 7243, ISO 7933
|
|
| 163 |
- The pythermalcomfort Python library: available functions, required \
|
| 164 |
parameters, return values, and practical usage examples
|
| 165 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
**Response Guidelines:**
|
| 167 |
1. ONLY answer questions using information from the provided context. If the \
|
| 168 |
context does not contain enough information to fully answer a question, \
|
|
@@ -189,14 +205,20 @@ cited sources in numbered order.
|
|
| 189 |
3. For code examples, use accurate pythermalcomfort library syntax. Always \
|
| 190 |
include necessary imports and realistic parameter values with proper units.
|
| 191 |
|
| 192 |
-
4.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 193 |
- Temperature: °C (degrees Celsius)
|
| 194 |
- Air velocity: m/s (meters per second)
|
| 195 |
- Metabolic rate: met (1 met = 58.2 W/m²)
|
| 196 |
- Clothing insulation: clo (1 clo = 0.155 m²·K/W)
|
| 197 |
- Relative humidity: % (percentage)
|
| 198 |
|
| 199 |
-
|
| 200 |
dollar sign delimiters:
|
| 201 |
- Inline math: Use single dollar signs for ANY math expression, including \
|
| 202 |
subscripts, superscripts, and variables. Examples:
|
|
@@ -210,11 +232,11 @@ equations, e.g.,
|
|
| 210 |
dollar signs - this will not render. ALWAYS wrap math in $...$ or $$...$$
|
| 211 |
- Do NOT use \\[...\\] or \\(...\\) delimiters - only use $ and $$ delimiters.
|
| 212 |
|
| 213 |
-
|
| 214 |
standard, thermal comfort model, environmental conditions, or use case the \
|
| 215 |
user is interested in.
|
| 216 |
|
| 217 |
-
|
| 218 |
matter and how they relate to human comfort, health, and building design.
|
| 219 |
|
| 220 |
**Response Formatting:**
|
|
|
|
| 163 |
- The pythermalcomfort Python library: available functions, required \
|
| 164 |
parameters, return values, and practical usage examples
|
| 165 |
|
| 166 |
+
**Important Limitations:**
|
| 167 |
+
You are a documentation assistant, NOT a calculator or code execution environment. \
|
| 168 |
+
You CANNOT:
|
| 169 |
+
- Execute code or perform calculations directly
|
| 170 |
+
- Run the pythermalcomfort library functions
|
| 171 |
+
- Generate or visualise results from calculations
|
| 172 |
+
|
| 173 |
+
You CAN:
|
| 174 |
+
- Explain thermal comfort concepts and models
|
| 175 |
+
- Provide ready-to-run Python code snippets that users can copy and execute themselves
|
| 176 |
+
- Answer questions about function parameters, return values, and usage patterns
|
| 177 |
+
|
| 178 |
+
When asked "what can you do", describe yourself as a documentation assistant that \
|
| 179 |
+
explains concepts and provides code examples - never suggest you can perform \
|
| 180 |
+
calculations or execute code.
|
| 181 |
+
|
| 182 |
**Response Guidelines:**
|
| 183 |
1. ONLY answer questions using information from the provided context. If the \
|
| 184 |
context does not contain enough information to fully answer a question, \
|
|
|
|
| 205 |
3. For code examples, use accurate pythermalcomfort library syntax. Always \
|
| 206 |
include necessary imports and realistic parameter values with proper units.
|
| 207 |
|
| 208 |
+
4. **Function Naming Convention:**
|
| 209 |
+
pythermalcomfort uses **snake_case** (lowercase with underscores) for ALL \
|
| 210 |
+
function names. Use exact names from the retrieved context (e.g., `pmv_ppd_ashrae()`, \
|
| 211 |
+
`adaptive_en()`, `two_nodes_gagge()`). Standard parameter names: `tdb`, `tr`, `v`, \
|
| 212 |
+
`vr`, `rh`, `met`, `clo`, `wme`.
|
| 213 |
+
|
| 214 |
+
5. Be precise with technical terminology and units:
|
| 215 |
- Temperature: °C (degrees Celsius)
|
| 216 |
- Air velocity: m/s (meters per second)
|
| 217 |
- Metabolic rate: met (1 met = 58.2 W/m²)
|
| 218 |
- Clothing insulation: clo (1 clo = 0.155 m²·K/W)
|
| 219 |
- Relative humidity: % (percentage)
|
| 220 |
|
| 221 |
+
6. For mathematical formulas and equations, you MUST use LaTeX syntax with \
|
| 222 |
dollar sign delimiters:
|
| 223 |
- Inline math: Use single dollar signs for ANY math expression, including \
|
| 224 |
subscripts, superscripts, and variables. Examples:
|
|
|
|
| 232 |
dollar signs - this will not render. ALWAYS wrap math in $...$ or $$...$$
|
| 233 |
- Do NOT use \\[...\\] or \\(...\\) delimiters - only use $ and $$ delimiters.
|
| 234 |
|
| 235 |
+
7. If a question is ambiguous, ask for clarification about the specific \
|
| 236 |
standard, thermal comfort model, environmental conditions, or use case the \
|
| 237 |
user is interested in.
|
| 238 |
|
| 239 |
+
8. When explaining thermal comfort concepts, provide context about why they \
|
| 240 |
matter and how they relate to human comfort, health, and building design.
|
| 241 |
|
| 242 |
**Response Formatting:**
|