Update pdf_utils_finalclean_airmac_final.py

pdf_utils_finalclean_airmac_final.py (CHANGED)

@@ -10,32 +10,40 @@ def is_artifact_word(word):

def looks_fully_repeated(word):
    core = re.sub(r"[^\w]", "", word)
    return bool(re.search(r"(.)\1{2,}", core)) and len(set(re.sub(r"(.)\1+", r"\1", core.lower()))) > 1

def is_entirely_tripled_letters(word):
    """
    True iff the word is made only of perfect three-by-three repeats:
    e.g. 'SSSTTTRRR' → True, 'SSSTTTAA' → False.
    """
    core = re.sub(r"[^\w]", "", word)
    if len(core) == 0 or len(core) % 3:
        return False
    return all(core[i].lower() == core[i + 1].lower() == core[i + 2].lower()
               for i in range(0, len(core), 3))
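A quick smoke test of the two predicates (illustrative inputs; assumes the definitions above are in scope):

    assert looks_fully_repeated("HHHEEELLLOOO")         # tripled run, >1 distinct letter
    assert not looks_fully_repeated("aaa")              # collapses to one distinct letter
    assert is_entirely_tripled_letters("SSSTTTRRR")     # perfect 3-by-3 repeats
    assert not is_entirely_tripled_letters("SSSTTTAA")  # length not a multiple of 3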

# === Dedup logic ===

COMMONNESS = 4.5
DUP3_RE = re.compile(r"(.)\1{2,}", flags=re.I)

def dedup(word: str) -> str:
    """
    • Collapse any ≥3-char runs to doubles (aaa → aa).
    • Decide case by case whether a remaining double should stay,
      using wordfreq Zipf scores for plausibility.
    """
    word = DUP3_RE.sub(lambda m: m.group(1) * 2, word)

    out, i = [], 0
    while i < len(word):
        if i + 1 < len(word) and word[i].lower() == word[i + 1].lower():
            keep = "".join(out) + word[i:i + 2] + word[i + 2:]
            single = "".join(out) + word[i] + word[i + 2:]
            if zipf_frequency(keep.lower(), "en") >= COMMONNESS and \
               zipf_frequency(keep.lower(), "en") >= zipf_frequency(single.lower(), "en"):
                out.append(word[i] * 2)
            else:
                out.append(word[i])
            i += 2

@@ -44,7 +52,6 @@ def dedup(word: str) -> str:
            i += 1
    return "".join(out)
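How the Zipf gate behaves in practice (a sketch; exact scores depend on wordfreq's frequency table, so the comments are expectations rather than guarantees):

    from wordfreq import zipf_frequency

    print(dedup("Helllo"))  # "lll" collapses to "ll"; "hello" is common
                            # (Zipf ≈ 6 > 4.5) and outscores "helo", so "ll" stays
    print(dedup("annnd"))   # "nnn" collapses to "nn"; "annd" is rare, so the
                            # double is reduced and "and" comes back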

# === Main cleanup ===

def clean_text(text):

@@ -74,32 +81,50 @@ def clean_text(text):
        normalized = re.sub(r"(.)\1{1,}", r"\1", line, flags=re.I)
        if any(re.search(p, normalized, flags=re.I) for p in skip_patterns):
            continue

        words, new_line = line.split(), []
        i = 0
        while i < len(words):
            # ── detect runs of tripled-letter words ──
            if is_entirely_tripled_letters(words[i]):
                j = i
                while j < len(words) and is_entirely_tripled_letters(words[j]):
                    j += 1
                run_len = j - i
                if run_len >= 5:  # run of ≥5 words → assume SOT header → DROP
                    i = j
                    continue
                else:             # shorter run → keep the words, deduped
                    for k in range(i, j):
                        new_line.append(dedup(words[k]))
                    i = j
                    continue

            # ── normal per-word cleanup ──
            w = words[i]
            if is_artifact_word(w):
                new_line.append(w[0])
            elif looks_fully_repeated(w):
                new_line.append(dedup(w))
            else:
                new_line.append(w)
            i += 1

        final = " ".join(new_line).strip()
        if final:
            cleaned.append(final)

    # Remove stray brackets and blank lines
    cleaned_no_brackets = []
    for ln in cleaned:
        ln = ln.replace("[", "").replace("]", "").strip()
        if ln:
            cleaned_no_brackets.append(ln)

    return "\n".join(cleaned_no_brackets)
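The run heuristic in one illustrative call (made-up input; assumes is_artifact_word, defined earlier in the file, leaves ordinary words untouched and that none of the file's skip_patterns match):

    demo = "TTTHHHIIISSS IIISSS AAA FFFUUULLLLLL SSSOOOTTT hello world"
    print(clean_text(demo))  # the five tripled-letter words read as an
                             # SOT header and are dropped → "hello world"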

# === Final cleanup ===

def apply_textpy_cleanup(text):
    patterns_to_skip = [
        r"teaser_", r"cold_open", r"\[anchor\]", r"\b\d{4}\s+\d{1,2}(?:am|pm)\b",

@@ -124,17 +149,13 @@ def apply_textpy_cleanup(text):
        r"stef w segment begins", r"stef w segment ends"
    ]

    cleaned = [ln for ln in text.splitlines()
               if not any(re.search(p, ln.lower()) for p in patterns_to_skip)]
    return "\n".join(cleaned)
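This pass is line-oriented: any line matching a skip pattern is removed wholesale (illustrative input; assumes none of the elided patterns fire on the second line):

    src = "teaser_open monday\nActual story text"
    print(apply_textpy_cleanup(src))  # → "Actual story text"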

# === Glue function ===

def pdf_to_final_cleaned_text(pdf_path):
    raw_text = extract_text(pdf_path)
    stage1 = clean_text(raw_text)
    return apply_textpy_cleanup(stage1)
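End-to-end, the pipeline is extract → clean_text → apply_textpy_cleanup. A minimal driver, assuming extract_text is pdfminer.six's high-level helper (the import sits above the diffed region, so this is an assumption) and "script.pdf" is a placeholder path:

    from pdfminer.high_level import extract_text  # assumed source of extract_text
    from wordfreq import zipf_frequency

    print(pdf_to_final_cleaned_text("script.pdf")[:500])  # placeholder path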