shelfgot committed on
Commit
8176e08
·
verified ·
1 Parent(s): 172b660

single predictions

Browse files
Files changed (1) hide show
  1. predict.py +171 -53
predict.py CHANGED
@@ -6,15 +6,61 @@ Generates predictions for all dafim using a trained model
6
  import torch
7
  import requests
8
  import os
 
9
  from train import TalmudClassifierLSTM, TalmudDataset, MAX_LEN
10
 
11
- # Preprocessing regex to match Vercel's preprocessing
12
- PREPROCESSING_REGEX = r'[\u0591-\u05C7]|[,\-?!:\.״]+|<big><strong>|<\/strong><\/big>'
13
- import re
14
 
15
- def preprocess_text(text: str) -> str:
16
- """Preprocess text by removing nikud and punctuation"""
17
- return re.sub(PREPROCESSING_REGEX, '', text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
  def fetch_daf_texts(vercel_base_url: str, auth_token: str) -> list:
20
  """
@@ -29,38 +75,21 @@ def fetch_daf_texts(vercel_base_url: str, auth_token: str) -> list:
29
  print(f"Fetching daf texts from {url}...")
30
 
31
  try:
 
32
  headers = {
33
  'x-auth-token': auth_token,
34
  'Content-Type': 'application/json'
35
  }
36
-
37
-
38
- vercel_bypass_token = os.getenv('VERCEL_BYPASS_TOKEN')
39
- if vercel_bypass_token:
40
-
41
- separator = '&' if '?' in url else '?'
42
- url = f"{url}{separator}x-vercel-set-bypass-cookie=true&x-vercel-protection-bypass={vercel_bypass_token}"
43
- print(f"Using Vercel bypass token for deployment protection")
44
-
45
  response = requests.get(url, headers=headers, timeout=60)
46
  response.raise_for_status()
47
  data = response.json()
48
  print(f"Fetched {data.get('count', 0)} dafim")
49
  return data.get('dafim', [])
50
- except requests.exceptions.HTTPError as e:
51
- print(f"HTTP Error fetching daf texts: {e}")
52
- if hasattr(e, 'response') and e.response is not None:
53
- print(f"Response status: {e.response.status_code}")
54
- # Print first 500 chars of response for debugging
55
- response_text = e.response.text[:500] if e.response.text else "No response text"
56
- print(f"Response text (first 500 chars): {response_text}")
57
- # Check if it's a deployment protection issue
58
- if e.response.status_code == 401 and 'Authentication Required' in response_text:
59
- print("ERROR: Deployment protection is blocking the request.")
60
- print("Make sure VERCEL_BYPASS_TOKEN is set correctly in HF Space environment variables.")
61
- raise
62
  except Exception as e:
63
  print(f"Error fetching daf texts: {e}")
 
 
 
64
  raise
65
 
66
  def text_to_sequence(text: str, word_to_idx: dict) -> list:
@@ -76,22 +105,60 @@ def generate_predictions_for_daf(
76
  max_len: int = MAX_LEN
77
  ) -> list:
78
  """
79
- Generate predictions for a single daf text.
80
  Returns list of ranges: [{'start': int, 'end': int, 'type': int}, ...]
 
81
 
82
  Strategy: Sliding window approach - predict on overlapping windows of text
83
  """
84
  model.eval()
85
 
86
- # Preprocess the text (should already be preprocessed, but be safe)
87
- preprocessed_text = preprocess_text(daf_text)
88
 
89
- # Split into words
90
  words = preprocessed_text.split()
91
 
92
  if len(words) == 0:
93
  return []
94
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
  # Use sliding window approach
96
  window_size = max_len
97
  stride = window_size // 2 # 50% overlap
@@ -124,32 +191,73 @@ def generate_predictions_for_daf(
124
  _, predicted = torch.max(output.data, 1)
125
  predicted_label_idx = predicted.item()
126
 
127
- # Calculate character positions in original text
128
- # Find the start position of this window in the original text
129
- window_text = ' '.join(window_words)
130
-
131
- # Find start position by searching in original text
132
- search_start = 0
133
- if i > 0:
134
- # Approximate position based on previous windows
135
- search_start = len(' '.join(words[:i]))
136
 
137
- # Find actual position in preprocessed text
138
- window_start_char = preprocessed_text.find(window_text, search_start)
 
139
 
140
- if window_start_char == -1:
141
- # Fallback: estimate position
142
- window_start_char = len(' '.join(words[:i])) if i > 0 else 0
143
-
144
- # Use the most confident prediction for the window center
145
- # For simplicity, predict the entire window as the predicted class
146
- window_end_char = window_start_char + len(window_text)
147
 
148
  # Only add if we have a valid range
149
- if window_end_char > window_start_char:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
  ranges.append({
151
- 'start': window_start_char,
152
- 'end': window_end_char,
153
  'type': int(predicted_label_idx)
154
  })
155
 
@@ -185,9 +293,15 @@ def generate_all_predictions(
185
  auth_token: str
186
  ) -> list:
187
  """
 
 
 
188
  Generate predictions for all dafim.
189
  Returns list of prediction objects: [{'daf_id': str, 'ranges': [...]}, ...]
190
 
 
 
 
191
  Args:
192
  model: Trained model
193
  word_to_idx: Word to index mapping
@@ -195,6 +309,7 @@ def generate_all_predictions(
195
  vercel_base_url: Base URL of the Vercel app
196
  auth_token: Authentication token for Vercel API (TRAINING_CALLBACK_TOKEN)
197
  """
 
198
  print("Fetching daf texts from Vercel...")
199
  dafim = fetch_daf_texts(vercel_base_url, auth_token)
200
 
@@ -212,7 +327,10 @@ def generate_all_predictions(
212
 
213
  try:
214
  daf_id = daf['id']
215
- text_content = daf['text_content'] # Already preprocessed from API
 
 
 
216
 
217
  ranges = generate_predictions_for_daf(
218
  model, text_content, word_to_idx, label_encoder
 
6
  import torch
7
  import requests
8
  import os
9
+ import re
10
  from train import TalmudClassifierLSTM, TalmudDataset, MAX_LEN
11
 
12
+ # Preprocessing regex to match Vercel's preprocessing exactly
13
+ # Vercel uses: /[\u0591-\u05C7]|[,\-?!:\.״]+|<[^>]+>/g
14
+ PREPROCESSING_REGEX = re.compile(r'[\u0591-\u05C7]|[,\-?!:\.״]+|<[^>]+>')
15
 
16
def preprocess_text(text: str) -> tuple[str, dict, dict]:
    """
    Strip nikud marks, punctuation, and HTML tags from *text*.

    Mirrors Vercel's preprocessing regex /[\u0591-\u05C7]|[,\-?!:\.״]+|<[^>]+>/g
    while additionally recording a bidirectional character-position mapping
    between the original and preprocessed strings.

    Returns:
        (preprocessed_text, prep_to_orig, orig_to_prep) where
        - prep_to_orig maps preprocessed position -> original position
        - orig_to_prep maps original position -> preprocessed position
          (-1 for characters that were removed)
    """
    kept_chars = []
    prep_to_orig = {}  # preprocessed index -> original index
    orig_to_prep = {}  # original index -> preprocessed index, or -1 if removed
    removed_punct = {',', '-', '?', '!', ':', '.', '״'}
    pos = 0

    while pos < len(text):
        # HTML tags are dropped as whole units: everything from '<' up to and
        # including the matching '>'. An unclosed '<' falls through and is kept.
        if text[pos] == '<':
            close = text.find('>', pos)
            if close != -1:
                for tag_pos in range(pos, close + 1):
                    orig_to_prep[tag_pos] = -1
                pos = close + 1
                continue

        ch = text[pos]
        # Drop Hebrew cantillation/nikud (U+0591..U+05C7) and the listed punctuation.
        if '\u0591' <= ch <= '\u05C7' or ch in removed_punct:
            orig_to_prep[pos] = -1
        else:
            prep_to_orig[len(kept_chars)] = pos
            orig_to_prep[pos] = len(kept_chars)
            kept_chars.append(ch)
        pos += 1

    return ''.join(kept_chars), prep_to_orig, orig_to_prep
64
 
65
  def fetch_daf_texts(vercel_base_url: str, auth_token: str) -> list:
66
  """
 
75
  print(f"Fetching daf texts from {url}...")
76
 
77
  try:
78
+ # Include authentication token in header
79
  headers = {
80
  'x-auth-token': auth_token,
81
  'Content-Type': 'application/json'
82
  }
 
 
 
 
 
 
 
 
 
83
  response = requests.get(url, headers=headers, timeout=60)
84
  response.raise_for_status()
85
  data = response.json()
86
  print(f"Fetched {data.get('count', 0)} dafim")
87
  return data.get('dafim', [])
 
 
 
 
 
 
 
 
 
 
 
 
88
  except Exception as e:
89
  print(f"Error fetching daf texts: {e}")
90
+ if hasattr(e, 'response') and e.response is not None:
91
+ print(f"Response status: {e.response.status_code}")
92
+ print(f"Response text: {e.response.text}")
93
  raise
94
 
95
  def text_to_sequence(text: str, word_to_idx: dict) -> list:
 
105
  max_len: int = MAX_LEN
106
  ) -> list:
107
  """
108
+ Generate predictions for a single daf text (original text, not preprocessed).
109
  Returns list of ranges: [{'start': int, 'end': int, 'type': int}, ...]
110
+ Positions are relative to the original text.
111
 
112
  Strategy: Sliding window approach - predict on overlapping windows of text
113
  """
114
  model.eval()
115
 
116
+ # Preprocess the text and get character mappings
117
+ preprocessed_text, prep_to_orig, orig_to_prep = preprocess_text(daf_text)
118
 
119
+ # Split into words and track character positions accurately
120
  words = preprocessed_text.split()
121
 
122
  if len(words) == 0:
123
  return []
124
 
125
+ # Build word boundaries in preprocessed text by tracking positions as we iterate
126
+ # This is more reliable than using find() which could match wrong occurrences
127
+ word_boundaries = []
128
+ char_pos = 0
129
+ word_idx = 0
130
+
131
+ # Iterate through preprocessed text to find word boundaries
132
+ while char_pos < len(preprocessed_text) and word_idx < len(words):
133
+ # Skip leading spaces
134
+ while char_pos < len(preprocessed_text) and preprocessed_text[char_pos] == ' ':
135
+ char_pos += 1
136
+
137
+ if char_pos >= len(preprocessed_text):
138
+ break
139
+
140
+ # Find the current word
141
+ word = words[word_idx]
142
+ word_start = char_pos
143
+
144
+ # Check if the word starts at this position
145
+ if preprocessed_text[char_pos:char_pos + len(word)] == word:
146
+ word_end = char_pos + len(word)
147
+ word_boundaries.append((word_start, word_end))
148
+ char_pos = word_end
149
+ word_idx += 1
150
+ else:
151
+ # Word doesn't match - this shouldn't happen, but handle gracefully
152
+ # Try to find the word starting from current position
153
+ found_pos = preprocessed_text.find(word, char_pos)
154
+ if found_pos != -1:
155
+ word_boundaries.append((found_pos, found_pos + len(word)))
156
+ char_pos = found_pos + len(word)
157
+ word_idx += 1
158
+ else:
159
+ # Skip this word if we can't find it
160
+ break
161
+
162
  # Use sliding window approach
163
  window_size = max_len
164
  stride = window_size // 2 # 50% overlap
 
191
  _, predicted = torch.max(output.data, 1)
192
  predicted_label_idx = predicted.item()
193
 
194
+ # Calculate character positions in preprocessed text using word boundaries
195
+ # Ensure we don't go out of bounds
196
+ if i >= len(word_boundaries):
197
+ continue
 
 
 
 
 
198
 
199
+ last_word_idx = min(i + len(window_words) - 1, len(word_boundaries) - 1)
200
+ if last_word_idx < i:
201
+ continue
202
 
203
+ # Start position is the start of the first word in the window
204
+ window_start_prep = word_boundaries[i][0]
205
+ # End position is the end of the last word in the window
206
+ window_end_prep = word_boundaries[last_word_idx][1]
 
 
 
207
 
208
  # Only add if we have a valid range
209
+ if window_end_prep > window_start_prep:
210
+ # Map preprocessed text positions to original text positions
211
+ # Find the original start position
212
+ original_start = prep_to_orig.get(window_start_prep)
213
+ if original_start is None:
214
+ # Find the closest mapped position before or at window_start_prep
215
+ for prep_pos in sorted(prep_to_orig.keys(), reverse=True):
216
+ if prep_pos <= window_start_prep:
217
+ original_start = prep_to_orig[prep_pos]
218
+ break
219
+ if original_start is None:
220
+ continue # Skip if we can't map start position
221
+
222
+ # Find the original end position
223
+ # window_end_prep points to the character after the last character in the window
224
+ # We need to map this to the original text
225
+ window_end_prep_clamped = min(window_end_prep, len(preprocessed_text))
226
+
227
+ # Find the original position corresponding to the end of the window
228
+ # If window_end_prep_clamped is at the end of preprocessed text, use end of original text
229
+ if window_end_prep_clamped >= len(preprocessed_text):
230
+ original_end = len(daf_text)
231
+ else:
232
+ # Find the original position for the character at window_end_prep_clamped
233
+ # (the character right after the window ends)
234
+ end_char_orig = prep_to_orig.get(window_end_prep_clamped)
235
+ if end_char_orig is not None:
236
+ original_end = end_char_orig
237
+ else:
238
+ # Character at window_end_prep_clamped was removed, find the next non-removed character
239
+ # Look for the next preprocessed position >= window_end_prep_clamped
240
+ next_prep_pos = None
241
+ for prep_pos in sorted(prep_to_orig.keys()):
242
+ if prep_pos >= window_end_prep_clamped:
243
+ next_prep_pos = prep_pos
244
+ break
245
+
246
+ if next_prep_pos is not None:
247
+ original_end = prep_to_orig[next_prep_pos]
248
+ else:
249
+ # No more characters in preprocessed text, use end of original text
250
+ original_end = len(daf_text)
251
+
252
+ # Ensure end is after start and within bounds
253
+ if original_end <= original_start:
254
+ # Fallback: ensure at least one character
255
+ original_end = min(original_start + 1, len(daf_text))
256
+ original_end = min(original_end, len(daf_text))
257
+
258
  ranges.append({
259
+ 'start': original_start,
260
+ 'end': original_end,
261
  'type': int(predicted_label_idx)
262
  })
263
 
 
293
  auth_token: str
294
  ) -> list:
295
  """
296
+ DEPRECATED: This function is no longer used in the training flow.
297
+ It's kept for reference but should not be called.
298
+
299
  Generate predictions for all dafim.
300
  Returns list of prediction objects: [{'daf_id': str, 'ranges': [...]}, ...]
301
 
302
+ NOTE: This function expects preprocessed text from the API, but generate_predictions_for_daf
303
+ now expects original text. This function needs to be updated if it's ever used again.
304
+
305
  Args:
306
  model: Trained model
307
  word_to_idx: Word to index mapping
 
309
  vercel_base_url: Base URL of the Vercel app
310
  auth_token: Authentication token for Vercel API (TRAINING_CALLBACK_TOKEN)
311
  """
312
+ print("WARNING: generate_all_predictions is deprecated and may not work correctly.")
313
  print("Fetching daf texts from Vercel...")
314
  dafim = fetch_daf_texts(vercel_base_url, auth_token)
315
 
 
327
 
328
  try:
329
  daf_id = daf['id']
330
+ # NOTE: The API returns preprocessed text, but generate_predictions_for_daf
331
+ # now expects original text. This will cause incorrect character position mapping.
332
+ # This function should fetch original text or be updated to handle preprocessed text.
333
+ text_content = daf['text_content']
334
 
335
  ranges = generate_predictions_for_daf(
336
  model, text_content, word_to_idx, label_encoder