Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
import os
|
| 2 |
import re
|
| 3 |
import time
|
|
@@ -116,7 +117,7 @@ def list_rar_files_in_dataset() -> list:
|
|
| 116 |
rar_files = [
|
| 117 |
f for f in all_files
|
| 118 |
if f.startswith(DATA_PATH)
|
| 119 |
-
and (f.lower().endswith(
|
| 120 |
]
|
| 121 |
|
| 122 |
logging.info(f"[*] Found {len(rar_files)} RAR files")
|
|
@@ -128,6 +129,32 @@ def list_rar_files_in_dataset() -> list:
|
|
| 128 |
logging.error(f"[!] Failed to list files: {e}")
|
| 129 |
return []
|
| 130 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 131 |
# === Extract and Upload RAR ===
|
| 132 |
async def extract_and_upload_rar(rar_file_path: str):
|
| 133 |
try:
|
|
@@ -173,13 +200,22 @@ async def rar_processor_worker():
|
|
| 173 |
try:
|
| 174 |
logging.info("[*] Scanning for RAR files...")
|
| 175 |
rar_files = list_rar_files_in_dataset()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
|
| 177 |
-
if
|
| 178 |
-
|
|
|
|
|
|
|
| 179 |
await extract_and_upload_rar(rar_file)
|
| 180 |
await asyncio.sleep(5) # Delay between files
|
| 181 |
else:
|
| 182 |
-
logging.info("[*] No RAR files found, waiting...")
|
| 183 |
|
| 184 |
# Wait 60 seconds before next scan
|
| 185 |
await asyncio.sleep(60)
|
|
@@ -210,20 +246,35 @@ def health():
|
|
| 210 |
|
| 211 |
@app.get("/scan")
|
| 212 |
def scan_rars():
|
| 213 |
-
"""Manually trigger RAR file scan"""
|
| 214 |
rar_files = list_rar_files_in_dataset()
|
| 215 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 216 |
|
| 217 |
@app.post("/extract-all")
|
| 218 |
async def extract_all():
|
| 219 |
-
"""Manually trigger extraction of all RAR files"""
|
| 220 |
rar_files = list_rar_files_in_dataset()
|
|
|
|
| 221 |
|
| 222 |
-
|
| 223 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 224 |
|
| 225 |
results = []
|
| 226 |
-
for rar_file in
|
| 227 |
success = await extract_and_upload_rar(rar_file)
|
| 228 |
results.append({"file": rar_file, "success": success})
|
| 229 |
await asyncio.sleep(5)
|
|
|
|
| 1 |
+
|
| 2 |
import os
|
| 3 |
import re
|
| 4 |
import time
|
|
|
|
| 117 |
rar_files = [
|
| 118 |
f for f in all_files
|
| 119 |
if f.startswith(DATA_PATH)
|
| 120 |
+
and (f.lower().endswith(".rar") or re.search(r'\.r\d{2}$', f, re.IGNORECASE))
|
| 121 |
]
|
| 122 |
|
| 123 |
logging.info(f"[*] Found {len(rar_files)} RAR files")
|
|
|
|
| 129 |
logging.error(f"[!] Failed to list files: {e}")
|
| 130 |
return []
|
| 131 |
|
| 132 |
+
# === List Extracted Courses in Dataset ===
|
| 133 |
+
def list_extracted_courses_in_dataset() -> set:
    """Return the names of the course folders present under EXTRACTED_PATH.

    Lists every file in the dataset repo ``REPO_ID`` and collects the path
    segment that immediately follows the ``EXTRACTED_PATH/`` prefix.

    Returns:
        A set of course names. If anything raises while listing or parsing,
        the error is logged and an empty set is returned instead.
    """
    try:
        logging.info(f"[*] Scanning dataset for extracted courses in {EXTRACTED_PATH}")
        repo_files = list_repo_files(
            repo_id=REPO_ID,
            repo_type="dataset",
            token=HF_TOKEN
        )

        # Only paths strictly below EXTRACTED_PATH are of interest; the first
        # segment after the prefix is taken as the course name.
        prefix = EXTRACTED_PATH + '/'
        course_names = {
            path[len(prefix):].split('/', 1)[0]
            for path in repo_files
            if path.startswith(prefix)
        }

        logging.info(f"[*] Found {len(course_names)} extracted courses")
        for name in course_names:
            logging.info(f" - {name}")

        return course_names
    except Exception as e:
        # Best-effort: callers receive "nothing extracted yet" on failure.
        logging.error(f"[!] Failed to list extracted courses: {e}")
        return set()
|
| 157 |
+
|
| 158 |
# === Extract and Upload RAR ===
|
| 159 |
async def extract_and_upload_rar(rar_file_path: str):
|
| 160 |
try:
|
|
|
|
| 200 |
try:
|
| 201 |
logging.info("[*] Scanning for RAR files...")
|
| 202 |
rar_files = list_rar_files_in_dataset()
|
| 203 |
+
extracted_courses = list_extracted_courses_in_dataset()
|
| 204 |
+
|
| 205 |
+
untouched_rar_files = []
|
| 206 |
+
for rar_file in rar_files:
|
| 207 |
+
course_name = extract_course_name(os.path.basename(rar_file))
|
| 208 |
+
if course_name not in extracted_courses:
|
| 209 |
+
untouched_rar_files.append(rar_file)
|
| 210 |
|
| 211 |
+
if untouched_rar_files:
|
| 212 |
+
logging.info(f"[*] Found {len(untouched_rar_files)} untouched RAR files.")
|
| 213 |
+
for rar_file in untouched_rar_files:
|
| 214 |
+
logging.info(f" - {rar_file}")
|
| 215 |
await extract_and_upload_rar(rar_file)
|
| 216 |
await asyncio.sleep(5) # Delay between files
|
| 217 |
else:
|
| 218 |
+
logging.info("[*] No untouched RAR files found, waiting...")
|
| 219 |
|
| 220 |
# Wait 60 seconds before next scan
|
| 221 |
await asyncio.sleep(60)
|
|
|
|
| 246 |
|
| 247 |
@app.get("/scan")
def scan_rars():
    """Manually trigger RAR file scan and identify untouched ones"""
    # Fetch both listings first, in the same order as before: archives, then
    # the set of course names that already exist in the extracted area.
    archives = list_rar_files_in_dataset()
    already_extracted = list_extracted_courses_in_dataset()

    # An archive is "untouched" when the course name derived from its file
    # name is not among the extracted courses.
    pending = [
        archive
        for archive in archives
        if extract_course_name(os.path.basename(archive)) not in already_extracted
    ]

    return {
        "total_rar_files": len(archives),
        "extracted_courses": len(already_extracted),
        "untouched_rar_files": pending,
    }
|
| 260 |
|
| 261 |
@app.post("/extract-all")
|
| 262 |
async def extract_all():
|
| 263 |
+
"""Manually trigger extraction of all untouched RAR files"""
|
| 264 |
rar_files = list_rar_files_in_dataset()
|
| 265 |
+
extracted_courses = list_extracted_courses_in_dataset()
|
| 266 |
|
| 267 |
+
untouched_rar_files = []
|
| 268 |
+
for rar_file in rar_files:
|
| 269 |
+
course_name = extract_course_name(os.path.basename(rar_file))
|
| 270 |
+
if course_name not in extracted_courses:
|
| 271 |
+
untouched_rar_files.append(rar_file)
|
| 272 |
+
|
| 273 |
+
if not untouched_rar_files:
|
| 274 |
+
return {"message": "No untouched RAR files found to extract"}
|
| 275 |
|
| 276 |
results = []
|
| 277 |
+
for rar_file in untouched_rar_files:
|
| 278 |
success = await extract_and_upload_rar(rar_file)
|
| 279 |
results.append({"file": rar_file, "success": success})
|
| 280 |
await asyncio.sleep(5)
|