Spaces:

Yozora721
/

pnp-chatbot-admin-v1

Sleeping

App Files Files Community

FauziIsyrinApridal committed on Aug 17, 2025

Commit

174c308

1 Parent(s): 7add442

revisi 12

Browse files

Files changed (1) hide show

scrapping/utils/supabase_utils.py +85 -12

scrapping/utils/supabase_utils.py CHANGED Viewed

@@ -19,14 +19,27 @@ def _list_names(supabase, bucket: str) -> List[str]:
 def _extract_prefix_and_match_pattern(filename: str):
-    # Expect filenames like: <prefix>_YYYYMMDD_HHMMSS.txt
-    m = re.match(r"^(.*)_(\d{8}_\d{6})\.txt$", filename)
-    if not m:
-        # fallback: treat entire name (without extension) as prefix
-        base = filename.rsplit('.', 1)[0]
-        return base, rf"^{re.escape(base)}_\d{{8}}_\d{{6}}\.txt$"
-    prefix = m.group(1)
-    pattern = rf"^{re.escape(prefix)}_\d{{8}}_\d{{6}}\.txt$"
     return prefix, pattern
@@ -36,8 +49,15 @@ def _pick_latest_name(names: List[str], pattern: str) -> Optional[str]:
         return None
     def ts_key(name: str):
-        m = re.search(r"_(\d{8}_\d{6})\.txt$", name)
-        return m.group(1) if m else "00000000_000000"
     matched.sort(key=ts_key, reverse=True)
     return matched[0]
@@ -53,6 +73,38 @@ def _download_text(supabase, bucket: str, name: str) -> Optional[str]:
         return None
 def upload_if_changed(supabase, bucket: str, filename: str, content: Union[str, bytes]) -> Dict[str, str]:
     """Upload file only if content differs from latest existing file with the same prefix.
@@ -67,18 +119,39 @@ def upload_if_changed(supabase, bucket: str, filename: str, content: Union[str,
             payload = content.encode('utf-8')
         prefix, pattern = _extract_prefix_and_match_pattern(filename)
         names = _list_names(supabase, bucket)
         latest = _pick_latest_name(names, pattern)
         if latest:
             old_text = _download_text(supabase, bucket, latest)
-            if old_text is not None and old_text == text:
-                return {"result": "skipped"}
         supabase.storage.from_(bucket).upload(
             path=filename,
             file=payload,
             file_options={"content-type": "text/plain; charset=utf-8"}
         )
         return {"result": "uploaded"}
     except Exception as e:
         return {"result": "error", "error": str(e)}

 def _extract_prefix_and_match_pattern(filename: str):
+    """Split a file name into its prefix and a regex matching its timestamped siblings.
+
+    A trailing _YYYYMMDD_HHMM or _YYYYMMDD_HHMMSS suffix before the extension
+    is recognised and dropped to obtain the prefix. When no such suffix is
+    present, the whole name without its extension is used as the prefix.
+
+    Returns (prefix, regex_pattern). The pattern is anchored at both ends and
+    matches "<prefix>_<8 digits>_<4 to 6 digits><extension>", where the
+    extension is the one found on `filename`, or ".txt" if it had none.
+    """
+    # Strip extension
+    base, _, ext = filename.rpartition('.')
+    if not base:
+        # rpartition leaves the head empty when there is no dot (or when the
+        # last dot is the first character); treat the name as extension-less.
+        base = filename
+        ext = ''
+    # Match last occurrence of timestamp suffix
+    # NOTE: \d{4,6} also lets a 5-digit time through, not only HHMM / HHMMSS.
+    m = re.match(r"^(.*)_(\d{8}_\d{4,6})$", base)
+    if m:
+        prefix = m.group(1)
+    else:
+        # No recognizable timestamp; treat entire base as prefix
+        prefix = base
+    # Build a pattern that accepts either HHMM or HHMMSS
+    # Prefix and extension are escaped so they are matched literally.
+    ext_pattern = re.escape('.' + ext) if ext else r"\.txt"
+    pattern = rf"^{re.escape(prefix)}_\d{{8}}_\d{{4,6}}{ext_pattern}$"
     return prefix, pattern
         return None
     def ts_key(name: str):
+        """Return a "YYYYMMDD_HHMMSS" string key taken from the timestamp at the end of `name`.
+
+        Names without a recognisable timestamp get the all-zero key
+        "00000000_000000".
+        """
+        # extract timestamp allowing HHMM or HHMMSS and normalize to HHMMSS for comparison
+        m = re.search(r"_(\d{8})_(\d{4,6})\.[^.]+$", name)
+        if not m:
+            return "00000000_000000"
+        date = m.group(1)
+        time = m.group(2)
+        # Only a 4-digit time is padded with seconds; 5- and 6-digit values are used as found.
+        if len(time) == 4:
+            time = time + "00"
+        return f"{date}_{time}"
     matched.sort(key=ts_key, reverse=True)
     return matched[0]
         return None
+def _normalize_text(text: str) -> str:
+    """Normalize text for comparison by removing volatile timestamp lines and trimming whitespace.
+
+    - Removes any leading byte-order mark (U+FEFF)
+    - Drops lines whose first non-whitespace text is 'Diperbarui pada:' or
+      'Tanggal Akses:' (Indonesian for "Updated on:" / "Access date:")
+    - Strips trailing whitespace on each remaining line
+    - Collapses runs of blank lines into a single blank line
+    - Trims leading/trailing whitespace overall
+
+    Lines are re-joined with "\\n", so the original line endings are not preserved.
+    """
+    # Remove BOM if present
+    if text and text.startswith("\ufeff"):
+        text = text.lstrip("\ufeff")
+    lines = []
+    for line in text.splitlines():
+        # Ignore indentation when looking for the timestamp markers.
+        lstrip = line.lstrip()
+        if lstrip.startswith("Diperbarui pada:") or lstrip.startswith("Tanggal Akses:"):
+            continue
+        lines.append(line.rstrip())
+    # Collapse multiple blank lines
+    collapsed = []
+    last_blank = False
+    for ln in lines:
+        is_blank = (ln.strip() == "")
+        # Keep the first blank line of a run and skip the rest.
+        if is_blank and last_blank:
+            continue
+        collapsed.append(ln)
+        last_blank = is_blank
+    return "\n".join(collapsed).strip()
 def upload_if_changed(supabase, bucket: str, filename: str, content: Union[str, bytes]) -> Dict[str, str]:
     """Upload file only if content differs from latest existing file with the same prefix.
             payload = content.encode('utf-8')
         prefix, pattern = _extract_prefix_and_match_pattern(filename)
+        print(f"[DEDUP] Checking file: {filename}")
+        print(f"[DEDUP] Extracted prefix: '{prefix}', pattern: '{pattern}'")
         names = _list_names(supabase, bucket)
+        print(f"[DEDUP] Found {len(names)} total files in bucket")
         latest = _pick_latest_name(names, pattern)
         if latest:
+            print(f"[DEDUP] Latest existing file with same prefix: {latest}")
             old_text = _download_text(supabase, bucket, latest)
+            if old_text is not None:
+                old_normalized = _normalize_text(old_text)
+                new_normalized = _normalize_text(text)
+                print(f"[DEDUP] Old content length (normalized): {len(old_normalized)} chars")
+                print(f"[DEDUP] New content length (normalized): {len(new_normalized)} chars")
+                if old_normalized == new_normalized:
+                    print(f"[DEDUP] ✅ Content identical - SKIPPING upload")
+                    return {"result": "skipped"}
+                else:
+                    print(f"[DEDUP] ❌ Content differs - PROCEEDING with upload")
+            else:
+                print(f"[DEDUP] ⚠️ Could not download existing file content - PROCEEDING with upload")
+        else:
+            print(f"[DEDUP] No existing file with same prefix found - PROCEEDING with upload")
         supabase.storage.from_(bucket).upload(
             path=filename,
             file=payload,
             file_options={"content-type": "text/plain; charset=utf-8"}
         )
+        print(f"[DEDUP] ✅ Successfully uploaded: {filename}")
         return {"result": "uploaded"}
     except Exception as e:
+        print(f"[DEDUP] ❌ Error during upload: {str(e)}")
         return {"result": "error", "error": str(e)}