shelfgot committed on
Commit
be1bc6c
·
verified ·
1 Parent(s): 3319ff1

Update predict.py

Browse files
Files changed (1) hide show
  1. predict.py +21 -2
predict.py CHANGED
@@ -7,6 +7,7 @@ import torch
7
  import requests
8
  import os
9
  import re
 
10
  from train import TalmudClassifierLSTM, TalmudDataset, MAX_LEN
11
 
12
  # Preprocessing regex to match Vercel's preprocessing exactly
@@ -94,6 +95,12 @@ def fetch_daf_texts(vercel_base_url: str, auth_token: str) -> list:
94
 
95
  def text_to_sequence(text: str, word_to_idx: dict) -> list:
96
  """Convert text to sequence of word indices"""
 
 
 
 
 
 
97
  words = text.split()
98
  return [word_to_idx.get(word, word_to_idx['<UNK>']) for word in words]
99
 
@@ -156,8 +163,20 @@ def generate_predictions_for_daf(
156
  char_pos = found_pos + len(word)
157
  word_idx += 1
158
  else:
159
- # Skip this word if we can't find it
160
- break
 
 
 
 
 
 
 
 
 
 
 
 
161
 
162
  # Use sliding window approach
163
  window_size = max_len
 
7
  import requests
8
  import os
9
  import re
10
+ import warnings
11
  from train import TalmudClassifierLSTM, TalmudDataset, MAX_LEN
12
 
13
  # Preprocessing regex to match Vercel's preprocessing exactly
 
95
 
96
  def text_to_sequence(text: str, word_to_idx: dict) -> list:
97
  """Convert text to sequence of word indices"""
98
+ # Validate that required keys exist
99
+ if '<UNK>' not in word_to_idx:
100
+ raise ValueError("Vocabulary must contain '<UNK>' key")
101
+ if '<PAD>' not in word_to_idx:
102
+ raise ValueError("Vocabulary must contain '<PAD>' key")
103
+
104
  words = text.split()
105
  return [word_to_idx.get(word, word_to_idx['<UNK>']) for word in words]
106
 
 
163
  char_pos = found_pos + len(word)
164
  word_idx += 1
165
  else:
166
+ # Couldn't find word - this indicates a mismatch between words and preprocessed_text
167
+ # This can happen if preprocessing changed the text in an unexpected way
168
+ # Log a warning and use a fallback: estimate position based on character count
169
+ warnings.warn(f"Word '{word}' at index {word_idx} not found in preprocessed text. Using estimated position.")
170
+ # Estimate position: assume words are separated by single spaces
171
+ estimated_start = char_pos
172
+ estimated_end = estimated_start + len(word)
173
+ word_boundaries.append((estimated_start, min(estimated_end, len(preprocessed_text))))
174
+ char_pos = estimated_end
175
+ word_idx += 1
176
+
177
+ # Validate that we found boundaries for all words
178
+ if len(word_boundaries) < len(words):
179
+ warnings.warn(f"Only found boundaries for {len(word_boundaries)} out of {len(words)} words. Some predictions may be inaccurate.")
180
 
181
  # Use sliding window approach
182
  window_size = max_len