Spaces:
Sleeping
Sleeping
Zeggai Abdellah
committed on
Commit
·
4a8836b
1
Parent(s):
8517947
update the question generation to work with all files
Browse files
- app.py +28 -9
- chunk/chunks_WER8624_241_unstructured_api_output.json +0 -0
- chunk/chunks_WER8647_521_unstructured_api_output.json +0 -0
- chunk/chunks_WER8827_269_unstructured_api_output.json +0 -0
- chunk/chunks_WER8839_413_unstructured_api_output.json +0 -0
- chunk/chunks_WER8925_unstructured_api_output.json +0 -0
- chunk/chunks_WER9008_57_unstructured_api_output.json +0 -0
- chunk/chunks_WER9009_69_unstructured_api_output.json +0 -0
- chunk/chunks_WER9018_185_unstructured_api_output.json +0 -0
- chunk/chunks_WER9035_unstructured_api_output.json +0 -0
- chunk/chunks_WER9039_505_unstructured_api_output.json +0 -0
- chunk/chunks_WER9206_unstructured_api_output.json +0 -0
- chunk/chunks_WER9217_unstructured_api_output.json +0 -0
- chunk/chunks_WER9225_unstructured_api_output.json +0 -0
- chunk/chunks_WER9227_unstructured_api_output.json +0 -0
- chunk/chunks_WER9231_unstructured_api_output.json +0 -0
- chunk/chunks_WER9234_unstructured_api_output.json +0 -0
- chunk/chunks_WER9308_unstructured_api_output.json +0 -0
- chunk/chunks_WER9313_unstructured_api_output.json +0 -0
- chunk/chunks_WER9316_unstructured_api_output.json +0 -0
- chunk/chunks_WER9527_unstructured_api_output.json +0 -0
- chunk/chunks_WER9623_unstructured_api_output.json +0 -0
- chunk/chunks_WER9719_unstructured_api_output.json +0 -0
- chunk/chunks_WER9720_unstructured_api_output.json +0 -0
- chunk/chunks_WER9725_unstructured_api_output.json +0 -0
- chunk/chunks_WER9740_unstructured_api_output.json +0 -0
- chunk/chunks_WER9750_unstructured_api_output.json +0 -0
- chunk/chunks_WER9901_02_unstructured_api_output.json +0 -0
- chunk/chunks_With_bboxes_elements_Immunization_unstructured_api_output_mr_.json +0 -0
- chunks.json → chunk/chunks_filtered_elements_with_bboxes.json +0 -0
app.py
CHANGED
|
@@ -9,6 +9,7 @@ from datetime import datetime
|
|
| 9 |
from huggingface_hub import HfApi # For file persistence in Spaces
|
| 10 |
import os
|
| 11 |
import threading
|
|
|
|
| 12 |
|
| 13 |
# Load environment variables from .env file
|
| 14 |
load_dotenv()
|
|
@@ -195,7 +196,7 @@ def save_dataset_to_space(dataset: Dict, filename: str):
|
|
| 195 |
@app.get("/generate-questions")
|
| 196 |
async def generate_questions():
|
| 197 |
"""
|
| 198 |
-
Endpoint to generate questions from the
|
| 199 |
"""
|
| 200 |
global generation_status
|
| 201 |
|
|
@@ -219,23 +220,41 @@ async def generate_questions():
|
|
| 219 |
generation_status["result_file"] = None
|
| 220 |
generation_status["error"] = None
|
| 221 |
|
| 222 |
-
# Load
|
| 223 |
-
|
| 224 |
-
chunks_data = json.load(f)
|
| 225 |
|
| 226 |
-
if
|
| 227 |
-
raise HTTPException(status_code=404, detail="
|
| 228 |
|
| 229 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 230 |
|
| 231 |
# Start generation in background thread
|
| 232 |
-
thread = threading.Thread(target=generate_questions_in_background, args=(
|
| 233 |
thread.daemon = True
|
| 234 |
thread.start()
|
| 235 |
|
| 236 |
return {
|
| 237 |
"status": "started",
|
| 238 |
-
"message": "Question generation started
|
| 239 |
"current_status": generation_status
|
| 240 |
}
|
| 241 |
except Exception as e:
|
|
|
|
| 9 |
from huggingface_hub import HfApi # For file persistence in Spaces
|
| 10 |
import os
|
| 11 |
import threading
|
| 12 |
+
import glob
|
| 13 |
|
| 14 |
# Load environment variables from .env file
|
| 15 |
load_dotenv()
|
|
|
|
| 196 |
@app.get("/generate-questions")
|
| 197 |
async def generate_questions():
|
| 198 |
"""
|
| 199 |
+
Endpoint to generate questions from all JSON files in the data folder
|
| 200 |
"""
|
| 201 |
global generation_status
|
| 202 |
|
|
|
|
| 220 |
generation_status["result_file"] = None
|
| 221 |
generation_status["error"] = None
|
| 222 |
|
| 223 |
+
# Load all JSON files from data folder
|
| 224 |
+
json_files = glob.glob("./data/*.json")
|
|
|
|
| 225 |
|
| 226 |
+
if not json_files:
|
| 227 |
+
raise HTTPException(status_code=404, detail="No JSON files found in data folder")
|
| 228 |
|
| 229 |
+
all_chunks = []
|
| 230 |
+
for json_file in json_files:
|
| 231 |
+
with open(json_file, "r", encoding="utf-8") as f:
|
| 232 |
+
chunks_data = json.load(f)
|
| 233 |
+
if isinstance(chunks_data, list):
|
| 234 |
+
# If it's a list of chunks
|
| 235 |
+
for chunk in chunks_data:
|
| 236 |
+
if isinstance(chunk, dict) and "text" in chunk:
|
| 237 |
+
all_chunks.append(chunk["text"])
|
| 238 |
+
elif isinstance(chunk, str):
|
| 239 |
+
all_chunks.append(chunk)
|
| 240 |
+
elif isinstance(chunks_data, dict):
|
| 241 |
+
# If it's a dict, try to extract text content
|
| 242 |
+
if "text" in chunks_data:
|
| 243 |
+
all_chunks.append(chunks_data["text"])
|
| 244 |
+
elif "content" in chunks_data:
|
| 245 |
+
all_chunks.append(chunks_data["content"])
|
| 246 |
+
|
| 247 |
+
if not all_chunks:
|
| 248 |
+
raise HTTPException(status_code=404, detail="No text content found in JSON files")
|
| 249 |
|
| 250 |
# Start generation in background thread
|
| 251 |
+
thread = threading.Thread(target=generate_questions_in_background, args=(all_chunks,))
|
| 252 |
thread.daemon = True
|
| 253 |
thread.start()
|
| 254 |
|
| 255 |
return {
|
| 256 |
"status": "started",
|
| 257 |
+
"message": f"Question generation started for {len(json_files)} JSON files with {len(all_chunks)} chunks",
|
| 258 |
"current_status": generation_status
|
| 259 |
}
|
| 260 |
except Exception as e:
|
chunk/chunks_WER8624_241_unstructured_api_output.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
chunk/chunks_WER8647_521_unstructured_api_output.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
chunk/chunks_WER8827_269_unstructured_api_output.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
chunk/chunks_WER8839_413_unstructured_api_output.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
chunk/chunks_WER8925_unstructured_api_output.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
chunk/chunks_WER9008_57_unstructured_api_output.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
chunk/chunks_WER9009_69_unstructured_api_output.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
chunk/chunks_WER9018_185_unstructured_api_output.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
chunk/chunks_WER9035_unstructured_api_output.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
chunk/chunks_WER9039_505_unstructured_api_output.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
chunk/chunks_WER9206_unstructured_api_output.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
chunk/chunks_WER9217_unstructured_api_output.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
chunk/chunks_WER9225_unstructured_api_output.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
chunk/chunks_WER9227_unstructured_api_output.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
chunk/chunks_WER9231_unstructured_api_output.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
chunk/chunks_WER9234_unstructured_api_output.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
chunk/chunks_WER9308_unstructured_api_output.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
chunk/chunks_WER9313_unstructured_api_output.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
chunk/chunks_WER9316_unstructured_api_output.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
chunk/chunks_WER9527_unstructured_api_output.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
chunk/chunks_WER9623_unstructured_api_output.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
chunk/chunks_WER9719_unstructured_api_output.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
chunk/chunks_WER9720_unstructured_api_output.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
chunk/chunks_WER9725_unstructured_api_output.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
chunk/chunks_WER9740_unstructured_api_output.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
chunk/chunks_WER9750_unstructured_api_output.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
chunk/chunks_WER9901_02_unstructured_api_output.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
chunk/chunks_With_bboxes_elements_Immunization_unstructured_api_output_mr_.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
chunks.json → chunk/chunks_filtered_elements_with_bboxes.json
RENAMED
|
The diff for this file is too large to render.
See raw diff
|
|
|