Spaces:
Sleeping
Sleeping
FauziIsyrinApridal committed on
Commit ·
174c308
1
Parent(s): 7add442
revisi 12
Browse files
scrapping/utils/supabase_utils.py
CHANGED
|
@@ -19,14 +19,27 @@ def _list_names(supabase, bucket: str) -> List[str]:
|
|
| 19 |
|
| 20 |
|
| 21 |
def _extract_prefix_and_match_pattern(filename: str):
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
return prefix, pattern
|
| 31 |
|
| 32 |
|
|
@@ -36,8 +49,15 @@ def _pick_latest_name(names: List[str], pattern: str) -> Optional[str]:
|
|
| 36 |
return None
|
| 37 |
|
| 38 |
def ts_key(name: str):
|
| 39 |
-
|
| 40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
|
| 42 |
matched.sort(key=ts_key, reverse=True)
|
| 43 |
return matched[0]
|
|
@@ -53,6 +73,38 @@ def _download_text(supabase, bucket: str, name: str) -> Optional[str]:
|
|
| 53 |
return None
|
| 54 |
|
| 55 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
def upload_if_changed(supabase, bucket: str, filename: str, content: Union[str, bytes]) -> Dict[str, str]:
|
| 57 |
"""Upload file only if content differs from latest existing file with the same prefix.
|
| 58 |
|
|
@@ -67,18 +119,39 @@ def upload_if_changed(supabase, bucket: str, filename: str, content: Union[str,
|
|
| 67 |
payload = content.encode('utf-8')
|
| 68 |
|
| 69 |
prefix, pattern = _extract_prefix_and_match_pattern(filename)
|
|
|
|
|
|
|
|
|
|
| 70 |
names = _list_names(supabase, bucket)
|
|
|
|
|
|
|
| 71 |
latest = _pick_latest_name(names, pattern)
|
| 72 |
if latest:
|
|
|
|
| 73 |
old_text = _download_text(supabase, bucket, latest)
|
| 74 |
-
if old_text is not None
|
| 75 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
|
| 77 |
supabase.storage.from_(bucket).upload(
|
| 78 |
path=filename,
|
| 79 |
file=payload,
|
| 80 |
file_options={"content-type": "text/plain; charset=utf-8"}
|
| 81 |
)
|
|
|
|
| 82 |
return {"result": "uploaded"}
|
| 83 |
except Exception as e:
|
|
|
|
| 84 |
return {"result": "error", "error": str(e)}
|
|
|
|
| 19 |
|
| 20 |
|
| 21 |
def _extract_prefix_and_match_pattern(filename: str):
    """Split a timestamped filename into its prefix and a regex for sibling files.

    A timestamp suffix is ``_YYYYMMDD_HHMM`` or ``_YYYYMMDD_HHMMSS`` placed
    directly before the extension. When the name carries no such suffix, the
    whole base name (filename without extension) is used as the prefix.

    Args:
        filename: Name of the file about to be stored, e.g.
            ``"berita_20240101_1230.txt"``.

    Returns:
        A ``(prefix, regex_pattern)`` tuple. ``regex_pattern`` is anchored at
        both ends and matches ``<prefix>_<8 digits>_<4 or 6 digits><ext>``,
        where ``<ext>`` is the extension of ``filename`` (``.txt`` when the
        filename has none).
    """
    # Exactly four (HHMM) or six (HHMMSS) digits. A bare \d{4,6} would also
    # accept a malformed five-digit time, which is neither documented form.
    time_part = r"\d{4}(?:\d{2})?"

    # Strip the extension; names without a usable base keep the whole filename.
    base, _, ext = filename.rpartition('.')
    if not base:
        base = filename
        ext = ''

    # The greedy (.*) combined with the end anchor isolates the LAST timestamp
    # suffix, so prefixes that themselves contain digits/underscores survive.
    m = re.match(rf"^(.*)_(\d{{8}}_{time_part})$", base)
    prefix = m.group(1) if m else base

    # Build a pattern that accepts either HHMM or HHMMSS for the same prefix.
    ext_pattern = re.escape('.' + ext) if ext else r"\.txt"
    pattern = rf"^{re.escape(prefix)}_\d{{8}}_{time_part}{ext_pattern}$"
    return prefix, pattern
|
| 44 |
|
| 45 |
|
|
|
|
| 49 |
return None
|
| 50 |
|
| 51 |
def ts_key(name: str):
    """Return a sortable ``YYYYMMDD_HHMMSS``-style key taken from ``name``.

    The timestamp is read from the ``_<8 digits>_<4-6 digits>`` group that
    sits right before the final extension. A four-digit time (HHMM) is padded
    with ``"00"`` seconds; names without a timestamp get an all-zero key.
    """
    found = re.search(r"_(\d{8})_(\d{4,6})\.[^.]+$", name)
    if found is None:
        return "00000000_000000"

    day, clock = found.groups()
    # Only the HHMM form is padded; any other length is passed through as-is.
    clock = clock + "00" if len(clock) == 4 else clock
    return f"{day}_{clock}"
|
| 61 |
|
| 62 |
matched.sort(key=ts_key, reverse=True)
|
| 63 |
return matched[0]
|
|
|
|
| 73 |
return None
|
| 74 |
|
| 75 |
|
| 76 |
+
def _normalize_text(text: str) -> str:
    """Return a canonical form of ``text`` suitable for equality comparison.

    The following differences are ignored:

    - a leading byte-order mark
    - lines whose first non-blank text is 'Diperbarui pada:' or
      'Tanggal Akses:' (dropped entirely)
    - trailing whitespace on every line
    - runs of consecutive blank lines (reduced to one blank line)
    - leading/trailing whitespace of the whole text
    """
    volatile_prefixes = ("Diperbarui pada:", "Tanggal Akses:")

    # Drop a byte-order mark, if any, before splitting into lines.
    if text:
        text = text.lstrip("\ufeff")

    # Discard the volatile lines and right-trim everything that remains.
    kept = [
        raw.rstrip()
        for raw in text.splitlines()
        if not raw.lstrip().startswith(volatile_prefixes)
    ]

    # After the right-trim a blank line is exactly "", so a blank line is
    # skipped whenever the previously kept line is blank as well.
    squeezed = []
    for current in kept:
        if current == "" and squeezed and squeezed[-1] == "":
            continue
        squeezed.append(current)

    return "\n".join(squeezed).strip()
|
| 106 |
+
|
| 107 |
+
|
| 108 |
def upload_if_changed(supabase, bucket: str, filename: str, content: Union[str, bytes]) -> Dict[str, str]:
|
| 109 |
"""Upload file only if content differs from latest existing file with the same prefix.
|
| 110 |
|
|
|
|
| 119 |
payload = content.encode('utf-8')
|
| 120 |
|
| 121 |
prefix, pattern = _extract_prefix_and_match_pattern(filename)
|
| 122 |
+
print(f"[DEDUP] Checking file: {filename}")
|
| 123 |
+
print(f"[DEDUP] Extracted prefix: '{prefix}', pattern: '{pattern}'")
|
| 124 |
+
|
| 125 |
names = _list_names(supabase, bucket)
|
| 126 |
+
print(f"[DEDUP] Found {len(names)} total files in bucket")
|
| 127 |
+
|
| 128 |
latest = _pick_latest_name(names, pattern)
|
| 129 |
if latest:
|
| 130 |
+
print(f"[DEDUP] Latest existing file with same prefix: {latest}")
|
| 131 |
old_text = _download_text(supabase, bucket, latest)
|
| 132 |
+
if old_text is not None:
|
| 133 |
+
old_normalized = _normalize_text(old_text)
|
| 134 |
+
new_normalized = _normalize_text(text)
|
| 135 |
+
print(f"[DEDUP] Old content length (normalized): {len(old_normalized)} chars")
|
| 136 |
+
print(f"[DEDUP] New content length (normalized): {len(new_normalized)} chars")
|
| 137 |
+
|
| 138 |
+
if old_normalized == new_normalized:
|
| 139 |
+
print(f"[DEDUP] ✅ Content identical - SKIPPING upload")
|
| 140 |
+
return {"result": "skipped"}
|
| 141 |
+
else:
|
| 142 |
+
print(f"[DEDUP] ❌ Content differs - PROCEEDING with upload")
|
| 143 |
+
else:
|
| 144 |
+
print(f"[DEDUP] ⚠️ Could not download existing file content - PROCEEDING with upload")
|
| 145 |
+
else:
|
| 146 |
+
print(f"[DEDUP] No existing file with same prefix found - PROCEEDING with upload")
|
| 147 |
|
| 148 |
supabase.storage.from_(bucket).upload(
|
| 149 |
path=filename,
|
| 150 |
file=payload,
|
| 151 |
file_options={"content-type": "text/plain; charset=utf-8"}
|
| 152 |
)
|
| 153 |
+
print(f"[DEDUP] ✅ Successfully uploaded: {filename}")
|
| 154 |
return {"result": "uploaded"}
|
| 155 |
except Exception as e:
|
| 156 |
+
print(f"[DEDUP] ❌ Error during upload: {str(e)}")
|
| 157 |
return {"result": "error", "error": str(e)}
|