Spaces:

sadickam
/

pythermalcomfort_Chat

Running

App Files Files Community

sadickam committed on Jan 19

Commit

3a4f3dc

1 Parent(s): d93fa4c

feat: enhance chunking and extraction processes with debug artifact handling and function name corrections

Browse files

Files changed (6) hide show

.gitignore +3 -0
scripts/chunk.py +17 -2
scripts/extract.py +49 -0
src/rag_chatbot/chunking/chunker.py +31 -1
src/rag_chatbot/chunking/models.py +134 -2
src/rag_chatbot/llm/prompts.py +26 -4

.gitignore CHANGED Viewed

@@ -165,3 +165,6 @@ tests/
 !.pre-commit-config.yaml
 !.env.example
 !.dockerignore

 !.pre-commit-config.yaml
 !.env.example
 !.dockerignore
+# Local commands/notes
+commands.md

scripts/chunk.py CHANGED Viewed

@@ -385,6 +385,11 @@ def _get_chunker() -> Chunker:
 # =============================================================================
 def _compute_file_hash(path: Path) -> str:
     """Compute SHA-256 hash of a file's contents.
@@ -661,7 +666,11 @@ def run_chunking(  # noqa: PLR0912, PLR0915
     # -------------------------------------------------------------------------
     # Find all Markdown files in input directory (non-recursive, top-level only)
     # -------------------------------------------------------------------------
-    md_files = sorted([f for f in input_dir.glob("*.md") if f.is_file()])
     total_files = len(md_files)
     # -------------------------------------------------------------------------
@@ -931,7 +940,13 @@ def main(argv: list[str] | None = None) -> int:  # noqa: PLR0911
     # -------------------------------------------------------------------------
     # Check for Markdown files
     # -------------------------------------------------------------------------
-    md_files = list(args.input_dir.glob("*.md"))
     if not md_files:
         print(
             f"Error: No markdown files found in {args.input_dir}",

 # =============================================================================
+def _is_debug_artifact(path: Path) -> bool:
+    """Return True if the path is a debug artifact that should be excluded."""
+    return path.name.endswith(".raw.md")
 def _compute_file_hash(path: Path) -> str:
     """Compute SHA-256 hash of a file's contents.
     # -------------------------------------------------------------------------
     # Find all Markdown files in input directory (non-recursive, top-level only)
     # -------------------------------------------------------------------------
+    all_md_files = sorted([f for f in input_dir.glob("*.md") if f.is_file()])
+    md_files = [f for f in all_md_files if not _is_debug_artifact(f)]
+    skipped_debug = len(all_md_files) - len(md_files)
+    if skipped_debug > 0 and verbose and not quiet:
+        print(f"Skipping {skipped_debug} debug artifact(s) (*.raw.md)")
     total_files = len(md_files)
     # -------------------------------------------------------------------------
     # -------------------------------------------------------------------------
     # Check for Markdown files
     # -------------------------------------------------------------------------
+    md_files = [f for f in args.input_dir.glob("*.md") if not _is_debug_artifact(f)]
+    if args.verbose and args.quiet:
+        pass
+    elif args.verbose:
+        skipped_debug = len(list(args.input_dir.glob("*.md"))) - len(md_files)
+        if skipped_debug > 0:
+            print(f"Skipping {skipped_debug} debug artifact(s) (*.raw.md)")
     if not md_files:
         print(
             f"Error: No markdown files found in {args.input_dir}",

scripts/extract.py CHANGED Viewed

@@ -42,6 +42,7 @@ Note:
 from __future__ import annotations
 import argparse
 import sys
 import time
 from dataclasses import dataclass
@@ -216,6 +217,7 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
             - force: bool - Whether to overwrite existing files
             - verbose: bool - Whether to show detailed output
             - quiet: bool - Whether to suppress output
     Raises:
     ------
@@ -296,6 +298,14 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
         default=False,
         help="Suppress all output except errors (still shows summary)",
     )
     # -------------------------------------------------------------------------
     # Parse and return arguments
@@ -425,6 +435,7 @@ def run_extraction(  # noqa: PLR0912, PLR0915
     force: bool,
     verbose: bool,
     quiet: bool,
 ) -> ExtractionStatistics:
     """Run the PDF extraction process on all PDF files in the input directory.
@@ -449,6 +460,8 @@ def run_extraction(  # noqa: PLR0912, PLR0915
             If True, print detailed information including file names.
         quiet : bool
             If True, suppress progress bar (but still print summary).
     Returns:
     -------
@@ -545,10 +558,22 @@ def run_extraction(  # noqa: PLR0912, PLR0915
     # -------------------------------------------------------------------------
     # Process each PDF file
     # -------------------------------------------------------------------------
     for idx, pdf_path in enumerate(progress_bar):
         # Determine output path
         md_filename = pdf_path.stem + ".md"
         md_path = output_dir / md_filename
         # Check if extraction is needed
         if not _should_extract(pdf_path, md_path, force):
@@ -572,7 +597,30 @@ def run_extraction(  # noqa: PLR0912, PLR0915
             # Get markdown content and apply converter
             raw_markdown = document.to_markdown()
             clean_markdown = converter.convert(raw_markdown)
             # Write output file
             md_path.write_text(clean_markdown, encoding="utf-8")
@@ -701,6 +749,7 @@ def main(argv: list[str] | None = None) -> int:  # noqa: PLR0911
         force=args.force,
         verbose=args.verbose,
         quiet=args.quiet,
     )
     # -------------------------------------------------------------------------

 from __future__ import annotations
 import argparse
+import hashlib
 import sys
 import time
 from dataclasses import dataclass
             - force: bool - Whether to overwrite existing files
             - verbose: bool - Whether to show detailed output
             - quiet: bool - Whether to suppress output
+            - dump_raw_for: str | None - Optional PDF filename/stem to dump raw output
     Raises:
     ------
         default=False,
         help="Suppress all output except errors (still shows summary)",
     )
+    parser.add_argument(
+        "--dump-raw-for",
+        default=None,
+        help=(
+            "Dump raw pymupdf4llm markdown (pre-MarkdownConverter) for a "
+            "matching PDF filename or stem to <output_dir>/<stem>.raw.md"
+        ),
+    )
     # -------------------------------------------------------------------------
     # Parse and return arguments
     force: bool,
     verbose: bool,
     quiet: bool,
+    dump_raw_for: str | None = None,
 ) -> ExtractionStatistics:
     """Run the PDF extraction process on all PDF files in the input directory.
             If True, print detailed information including file names.
         quiet : bool
             If True, suppress progress bar (but still print summary).
+        dump_raw_for : str | None
+            Optional PDF filename or stem to dump raw markdown for.
     Returns:
     -------
     # -------------------------------------------------------------------------
     # Process each PDF file
     # -------------------------------------------------------------------------
+    raw_target = (dump_raw_for or "").strip()
+    raw_target_lower = raw_target.lower()
+    raw_target_is_filename = raw_target_lower.endswith(".pdf")
     for idx, pdf_path in enumerate(progress_bar):
         # Determine output path
         md_filename = pdf_path.stem + ".md"
         md_path = output_dir / md_filename
+        pdf_name_lower = pdf_path.name.lower()
+        pdf_stem_lower = pdf_path.stem.lower()
+        raw_dump_match = False
+        if raw_target_lower:
+            if raw_target_is_filename:
+                raw_dump_match = pdf_name_lower == raw_target_lower
+            else:
+                raw_dump_match = raw_target_lower in {pdf_name_lower, pdf_stem_lower}
         # Check if extraction is needed
         if not _should_extract(pdf_path, md_path, force):
             # Get markdown content and apply converter
             raw_markdown = document.to_markdown()
+            if raw_dump_match:
+                raw_dump_path = output_dir / f"{pdf_path.stem}.raw.md"
+                raw_dump_path.write_text(raw_markdown, encoding="utf-8")
+                if verbose:
+                    print(f"    Wrote raw markdown: {raw_dump_path}")
+            if verbose and raw_dump_match:
+                raw_hash = hashlib.sha256(raw_markdown.encode("utf-8")).hexdigest()[:12]
+                raw_underscore_count = raw_markdown.count("_")
+                print(
+                    "    Raw checksum/underscores:",
+                    raw_hash,
+                    f"underscores={raw_underscore_count}",
+                )
             clean_markdown = converter.convert(raw_markdown)
+            if verbose and raw_dump_match:
+                clean_hash = hashlib.sha256(clean_markdown.encode("utf-8")).hexdigest()[
+                    :12
+                ]
+                clean_underscore_count = clean_markdown.count("_")
+                print(
+                    "    Clean checksum/underscores:",
+                    clean_hash,
+                    f"underscores={clean_underscore_count}",
+                )
             # Write output file
             md_path.write_text(clean_markdown, encoding="utf-8")
         force=args.force,
         verbose=args.verbose,
         quiet=args.quiet,
+        dump_raw_for=args.dump_raw_for,
     )
     # -------------------------------------------------------------------------

src/rag_chatbot/chunking/chunker.py CHANGED Viewed

@@ -402,8 +402,25 @@ class Chunker:
         sliding_chunker = self._get_sliding_chunker()
         chunks: list[Chunk] = []
         # Normalize the content text
-        normalized_content = self._normalize_text(block.content)
         if not normalized_content.strip():
             return chunks, chunk_index_offset
@@ -435,6 +452,19 @@ class Chunker:
         return chunks, chunk_index_offset + len(chunks)
     def chunk_document(
         self,
         markdown: str,

         sliding_chunker = self._get_sliding_chunker()
         chunks: list[Chunk] = []
+        raw_content = block.content
+        if self._is_code_block(raw_content):
+            token_count = sliding_chunker.tokenizer.count_tokens(raw_content)
+            chunk_id = _generate_chunk_id(source, base_page, chunk_index_offset)
+            chunk = Chunk(
+                chunk_id=chunk_id,
+                text=raw_content,
+                heading_path=list(block.heading_path),
+                source=source,
+                page=base_page,
+                start_char=0,
+                end_char=len(raw_content),
+                token_count=token_count,
+            )
+            chunks.append(chunk)
+            return chunks, chunk_index_offset + 1
         # Normalize the content text
+        normalized_content = self._normalize_text(raw_content)
         if not normalized_content.strip():
             return chunks, chunk_index_offset
         return chunks, chunk_index_offset + len(chunks)
+    @staticmethod
+    def _is_code_block(content: str) -> bool:
+        """Return True when the content represents a standalone code block."""
+        if not content:
+            return False
+        stripped = content.lstrip()
+        if stripped.startswith("```") or stripped.startswith("~~~"):
+            return True
+        lines = [line for line in content.splitlines() if line.strip()]
+        if not lines:
+            return False
+        return all(line.startswith("    ") or line.startswith("\t") for line in lines)
     def chunk_document(
         self,
         markdown: str,

src/rag_chatbot/chunking/models.py CHANGED Viewed

@@ -161,7 +161,7 @@ THERMAL_COMFORT_TERMS: dict[str, str] = {
     "ther mal com fort": "thermal comfort",
     "ther mal": "thermal",
     "com fort": "comfort",
-    # Common function names from the library
     "pmv_ppd": "pmv_ppd",
     "adaptive_ashrae": "adaptive_ashrae",
     "adaptive_en": "adaptive_en",
@@ -183,6 +183,66 @@ THERMAL_COMFORT_TERMS: dict[str, str] = {
     "two_nodes": "two_nodes",
     "solar_altitude": "solar_altitude",
     "mean_radiant_temperature": "mean_radiant_temperature",
 }
 # Regex pattern for detecting ALL CAPS words (3+ consecutive capital letters)
@@ -230,8 +290,19 @@ _HTML_COMMENT_PATTERN: re.Pattern[str] = re.compile(r"<!--.*?-->", re.DOTALL)
 # Technical terms that should NOT be segmented
 # These are valid compound words or domain-specific terms
 _PROTECTED_TERMS: frozenset[str] = frozenset(
     {
         "pythermalcomfort",
         "thermalcomfort",
         "metabolicrate",
@@ -242,7 +313,60 @@ _PROTECTED_TERMS: frozenset[str] = frozenset(
         "physiological",
         "temperature",
         "temperatures",
-        # Add more as needed
     }
 )
@@ -600,6 +724,14 @@ class TextNormalizer:
                 result_words.append(word)
                 continue
             # Skip protected terms (case-insensitive)
             if stripped.lower() in _PROTECTED_TERMS:
                 result_words.append(word)

     "ther mal com fort": "thermal comfort",
     "ther mal": "thermal",
     "com fort": "comfort",
+    # Common function names from the library (preserving underscores)
     "pmv_ppd": "pmv_ppd",
     "adaptive_ashrae": "adaptive_ashrae",
     "adaptive_en": "adaptive_en",
     "two_nodes": "two_nodes",
     "solar_altitude": "solar_altitude",
     "mean_radiant_temperature": "mean_radiant_temperature",
+    # =========================================================================
+    # pythermalcomfort function name corrections
+    # Maps concatenated versions (without underscores) to correct snake_case
+    # This handles cases where PDF extraction strips underscores from names.
+    # =========================================================================
+    # Models - PMV/PPD variants
+    "pmvppdashrae": "pmv_ppd_ashrae",
+    "pmvppdiso": "pmv_ppd_iso",
+    "pmvathb": "pmv_athb",
+    "pmva": "pmv_a",
+    "pmve": "pmv_e",
+    # Models - Adaptive comfort
+    "adaptiveashrae": "adaptive_ashrae",
+    "adaptiveen": "adaptive_en",
+    # Models - Two-node models
+    "twonodesgagge": "two_nodes_gagge",
+    "twonodesgaggesleep": "two_nodes_gagge_sleep",
+    "twonodesgaggeji": "two_nodes_gagge_ji",
+    # Models - Heat indices
+    "heatindexlu": "heat_index_lu",
+    "heatindexrothfusz": "heat_index_rothfusz",
+    "discomfortindex": "discomfort_index",
+    # Models - Other thermal indices
+    "petsteady": "pet_steady",
+    "settmp": "set_tmp",
+    "coolingeffect": "cooling_effect",
+    "solargain": "solar_gain",
+    "usefansheatwaves": "use_fans_heatwaves",
+    "verticaltmpgradppd": "vertical_tmp_grad_ppd",
+    "ankledraft": "ankle_draft",
+    "clotout": "clo_tout",
+    # Models - Work capacity
+    "workcapacitydunne": "work_capacity_dunne",
+    "workcapacityhothaps": "work_capacity_hothaps",
+    "workcapacityiso": "work_capacity_iso",
+    "workcapacityniosh": "work_capacity_niosh",
+    # Models - Wind chill
+    "windchilltemperature": "wind_chill_temperature",
+    # Utilities - Temperature and psychrometrics
+    "runningmeanoutdoortemperature": "running_mean_outdoor_temperature",
+    "meanradianttmp": "mean_radiant_tmp",
+    "operativetmp": "operative_tmp",
+    "dewpointtmp": "dew_point_tmp",
+    "wetbulbtmp": "wet_bulb_tmp",
+    "enthalpyair": "enthalpy_air",
+    "bodysurfacearea": "body_surface_area",
+    "psytarh": "psy_ta_rh",
+    "vrelative": "v_relative",
+    "unitsconverter": "units_converter",
+    # Utilities - Clothing functions
+    "clodynamicashrae": "clo_dynamic_ashrae",
+    "clodynamiciso": "clo_dynamic_iso",
+    "cloinsulationairlayer": "clo_insulation_air_layer",
+    "cloareafactor": "clo_area_factor",
+    "clocorrectionfactorenvironment": "clo_correction_factor_environment",
+    "clointrinsicinsulatioensemble": "clo_intrinsic_insulation_ensemble",
+    "clototalinsulation": "clo_total_insulation",
+    "clotypicalensembles": "clo_typical_ensembles",
+    "cloindividualgarments": "clo_individual_garments",
+    "mettypicaltasks": "met_typical_tasks",
 }
 # Regex pattern for detecting ALL CAPS words (3+ consecutive capital letters)
 # Technical terms that should NOT be segmented
 # These are valid compound words or domain-specific terms
+#
+# IMPORTANT: This list includes pythermalcomfort function names in their
+# concatenated form (without underscores) because PDF extraction sometimes
+# strips underscores. When a word like "pmvppdashrae" is encountered, it
+# should NOT be segmented into "pmv ppd ashrae" - instead, it should be
+# preserved so that downstream processing or the LLM can recognise it as
+# a function name variant.
+#
+# The function names are extracted from:
+# pythermalcomfort-readthedocs-io-en-latest.pdf (official documentation)
 _PROTECTED_TERMS: frozenset[str] = frozenset(
     {
+        # General technical terms
         "pythermalcomfort",
         "thermalcomfort",
         "metabolicrate",
         "physiological",
         "temperature",
         "temperatures",
+        # =====================================================================
+        # pythermalcomfort.models function names (concatenated, lowercase)
+        # These protect against incorrect segmentation of function names
+        # when underscores are stripped during PDF extraction.
+        # =====================================================================
+        "adaptiveashrae",      # adaptive_ashrae
+        "adaptiveen",          # adaptive_en
+        "ankledraft",          # ankle_draft
+        "clotout",             # clo_tout
+        "coolingeffect",       # cooling_effect
+        "discomfortindex",     # discomfort_index
+        "twonodesgagge",       # two_nodes_gagge
+        "twonodesgaggesleep",  # two_nodes_gagge_sleep
+        "twonodesgaggeji",     # two_nodes_gagge_ji
+        "heatindexlu",         # heat_index_lu
+        "heatindexrothfusz",   # heat_index_rothfusz
+        "petsteady",           # pet_steady
+        "pmvppdiso",           # pmv_ppd_iso
+        "pmvppdashrae",        # pmv_ppd_ashrae
+        "pmvathb",             # pmv_athb
+        "solargain",           # solar_gain
+        "settmp",              # set_tmp
+        "usefansheatwaves",    # use_fans_heatwaves
+        "verticaltmpgradppd",  # vertical_tmp_grad_ppd
+        "windchilltemperature",  # wind_chill_temperature
+        "workcapacitydunne",   # work_capacity_dunne
+        "workcapacityhothaps", # work_capacity_hothaps
+        "workcapacityiso",     # work_capacity_iso
+        "workcapacityniosh",   # work_capacity_niosh
+        # =====================================================================
+        # pythermalcomfort.utilities function names (concatenated, lowercase)
+        # =====================================================================
+        "runningmeanoutdoortemperature",  # running_mean_outdoor_temperature
+        "vrelative",           # v_relative
+        "clodynamicashrae",    # clo_dynamic_ashrae
+        "clodynamiciso",       # clo_dynamic_iso
+        "bodysurfacearea",     # body_surface_area
+        "dewpointtmp",         # dew_point_tmp
+        "enthalpyair",         # enthalpy_air
+        "meanradianttmp",      # mean_radiant_tmp
+        "operativetmp",        # operative_tmp
+        "psytarh",             # psy_ta_rh
+        "psat",                # p_sat
+        "fsvv",                # f_svv
+        "unitsconverter",      # units_converter
+        "wetbulbtmp",          # wet_bulb_tmp
+        "cloinsulationairlayer",       # clo_insulation_air_layer
+        "cloareafactor",       # clo_area_factor
+        "clocorrectionfactorenvironment",  # clo_correction_factor_environment
+        "clointrinsicinsulatioensemble",   # clo_intrinsic_insulation_ensemble
+        "clototalinsulation",  # clo_total_insulation
+        "clotypicalensembles", # clo_typical_ensembles
+        "cloindividualgarments",  # clo_individual_garments
+        "mettypicaltasks",     # met_typical_tasks
     }
 )
                 result_words.append(word)
                 continue
+            # Skip words containing underscores - these are Python identifiers
+            # (e.g., pmv_ppd_ashrae, clo_dynamic_iso) that should be preserved
+            # exactly as-is. Underscores in function names are intentional and
+            # segmenting them would corrupt the identifier.
+            if "_" in stripped:
+                result_words.append(word)
+                continue
             # Skip protected terms (case-insensitive)
             if stripped.lower() in _PROTECTED_TERMS:
                 result_words.append(word)

src/rag_chatbot/llm/prompts.py CHANGED Viewed

@@ -163,6 +163,22 @@ ISO 7243, ISO 7933
 - The pythermalcomfort Python library: available functions, required \
 parameters, return values, and practical usage examples
 **Response Guidelines:**
 1. ONLY answer questions using information from the provided context. If the \
 context does not contain enough information to fully answer a question, \
@@ -189,14 +205,20 @@ cited sources in numbered order.
 3. For code examples, use accurate pythermalcomfort library syntax. Always \
 include necessary imports and realistic parameter values with proper units.
-4. Be precise with technical terminology and units:
    - Temperature: °C (degrees Celsius)
    - Air velocity: m/s (meters per second)
    - Metabolic rate: met (1 met = 58.2 W/m²)
    - Clothing insulation: clo (1 clo = 0.155 m²·K/W)
    - Relative humidity: % (percentage)
-5. For mathematical formulas and equations, you MUST use LaTeX syntax with \
 dollar sign delimiters:
    - Inline math: Use single dollar signs for ANY math expression, including \
 subscripts, superscripts, and variables. Examples:
@@ -210,11 +232,11 @@ equations, e.g.,
 dollar signs - this will not render. ALWAYS wrap math in $...$ or $$...$$
    - Do NOT use \\[...\\] or \\(...\\) delimiters - only use $ and $$ delimiters.
-6. If a question is ambiguous, ask for clarification about the specific \
 standard, thermal comfort model, environmental conditions, or use case the \
 user is interested in.
-7. When explaining thermal comfort concepts, provide context about why they \
 matter and how they relate to human comfort, health, and building design.
 **Response Formatting:**

 - The pythermalcomfort Python library: available functions, required \
 parameters, return values, and practical usage examples
+**Important Limitations:**
+You are a documentation assistant, NOT a calculator or code execution environment. \
+You CANNOT:
+- Execute code or perform calculations directly
+- Run the pythermalcomfort library functions
+- Generate or visualise results from calculations
+You CAN:
+- Explain thermal comfort concepts and models
+- Provide ready-to-run Python code snippets that users can copy and execute themselves
+- Answer questions about function parameters, return values, and usage patterns
+When asked "what can you do", describe yourself as a documentation assistant that \
+explains concepts and provides code examples - never suggest you can perform \
+calculations or execute code.
 **Response Guidelines:**
 1. ONLY answer questions using information from the provided context. If the \
 context does not contain enough information to fully answer a question, \
 3. For code examples, use accurate pythermalcomfort library syntax. Always \
 include necessary imports and realistic parameter values with proper units.
+4. **Function Naming Convention:**
+   pythermalcomfort uses **snake_case** (lowercase with underscores) for ALL \
+function names. Use exact names from the retrieved context (e.g., `pmv_ppd_ashrae()`, \
+`adaptive_en()`, `two_nodes_gagge()`). Standard parameter names: `tdb`, `tr`, `v`, \
+`vr`, `rh`, `met`, `clo`, `wme`.
+5. Be precise with technical terminology and units:
    - Temperature: °C (degrees Celsius)
    - Air velocity: m/s (meters per second)
    - Metabolic rate: met (1 met = 58.2 W/m²)
    - Clothing insulation: clo (1 clo = 0.155 m²·K/W)
    - Relative humidity: % (percentage)
+6. For mathematical formulas and equations, you MUST use LaTeX syntax with \
 dollar sign delimiters:
    - Inline math: Use single dollar signs for ANY math expression, including \
 subscripts, superscripts, and variables. Examples:
 dollar signs - this will not render. ALWAYS wrap math in $...$ or $$...$$
    - Do NOT use \\[...\\] or \\(...\\) delimiters - only use $ and $$ delimiters.
+7. If a question is ambiguous, ask for clarification about the specific \
 standard, thermal comfort model, environmental conditions, or use case the \
 user is interested in.
+8. When explaining thermal comfort concepts, provide context about why they \
 matter and how they relate to human comfort, health, and building design.
 **Response Formatting:**