Spaces:
Sleeping
Sleeping
Update main.py
Browse files
main.py
CHANGED
|
@@ -937,65 +937,98 @@ setInterval(tick, 500);
|
|
| 937 |
# Create temp directory for files
|
| 938 |
temp_dir = tempfile.mkdtemp(prefix='hf_examples_')
|
| 939 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 940 |
# Extract requested files from dataset rows
|
| 941 |
for file_path in file_names:
|
| 942 |
filename = os.path.basename(file_path)
|
| 943 |
file_found = False
|
| 944 |
|
|
|
|
|
|
|
| 945 |
# Search through dataset rows
|
| 946 |
-
for row in ds:
|
| 947 |
-
#
|
| 948 |
-
|
| 949 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 950 |
|
| 951 |
-
# Match by filename
|
| 952 |
-
if
|
| 953 |
temp_file_path = os.path.join(temp_dir, filename)
|
|
|
|
| 954 |
|
| 955 |
-
|
| 956 |
-
|
| 957 |
-
|
| 958 |
-
|
| 959 |
-
|
| 960 |
-
try:
|
| 961 |
-
# Check if it's already bytes
|
| 962 |
-
if isinstance(pdf_data, bytes):
|
| 963 |
with open(temp_file_path, 'wb') as f:
|
| 964 |
-
f.write(pdf_data)
|
| 965 |
-
|
| 966 |
-
elif
|
| 967 |
-
|
| 968 |
-
|
| 969 |
-
|
| 970 |
-
elif 'path' in pdf_data:
|
| 971 |
-
# It's a file path, copy the file
|
| 972 |
-
import shutil
|
| 973 |
-
shutil.copy2(pdf_data['path'], temp_file_path)
|
| 974 |
-
# Try to read as file path string
|
| 975 |
-
elif isinstance(pdf_data, str) and os.path.exists(pdf_data):
|
| 976 |
-
import shutil
|
| 977 |
-
shutil.copy2(pdf_data, temp_file_path)
|
| 978 |
else:
|
| 979 |
-
logger.error(f"
|
| 980 |
continue
|
| 981 |
-
|
| 982 |
-
|
| 983 |
-
|
| 984 |
-
|
| 985 |
-
|
| 986 |
-
|
| 987 |
-
|
| 988 |
-
|
| 989 |
-
logger.error(f"
|
| 990 |
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 991 |
|
| 992 |
if not file_found:
|
| 993 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 994 |
file_info_text += f"⚠️ {filename} - Not found in dataset\n"
|
| 995 |
-
|
| 996 |
-
available = [row.get('pdf', 'N/A') for row in ds]
|
| 997 |
-
logger.debug(f"Available files in dataset: {available}")
|
| 998 |
-
|
| 999 |
if not copied_files:
|
| 1000 |
if len(ds) > 0:
|
| 1001 |
logger.error(f"Dataset structure: {list(ds[0].keys())}")
|
|
@@ -1069,14 +1102,14 @@ setInterval(tick, 500);
|
|
| 1069 |
if is_hf_space:
|
| 1070 |
# Hugging Face Spaces configuration
|
| 1071 |
logger.info("Running on Hugging Face Spaces")
|
| 1072 |
-
demo.launch(theme=gr.themes.Soft(),server_name="0.0.0.0", server_port=7860, css=css, js=js)
|
| 1073 |
else:
|
| 1074 |
# Local development configuration
|
| 1075 |
configured_port = int(os.environ.get("GRADIO_SERVER_PORT", "7860"))
|
| 1076 |
server_port = _find_open_port(configured_port)
|
| 1077 |
logger.info(f"Launching Gradio on port {server_port}")
|
| 1078 |
logger.info(f"Access the app at: http://127.0.0.1:{server_port}")
|
| 1079 |
-
demo.launch(theme=gr.themes.Soft(),server_port=server_port, share=False, css=css, js=js)
|
| 1080 |
|
| 1081 |
|
| 1082 |
if __name__ == "__main__":
|
|
|
|
| 937 |
# Create temp directory for files
|
| 938 |
temp_dir = tempfile.mkdtemp(prefix='hf_examples_')
|
| 939 |
|
| 940 |
+
# Debug: Log first row structure
|
| 941 |
+
if len(ds) > 0:
|
| 942 |
+
first_row = ds[0]
|
| 943 |
+
pdf_data = first_row.get('pdf', None)
|
| 944 |
+
logger.info(f"Dataset first row 'pdf' type: {type(pdf_data)}")
|
| 945 |
+
if isinstance(pdf_data, dict):
|
| 946 |
+
logger.info(f"PDF dict keys: {list(pdf_data.keys())}")
|
| 947 |
+
if 'path' in pdf_data:
|
| 948 |
+
logger.info(f"PDF path example: {pdf_data.get('path', 'N/A')}")
|
| 949 |
+
|
| 950 |
# Extract requested files from dataset rows
|
| 951 |
for file_path in file_names:
|
| 952 |
filename = os.path.basename(file_path)
|
| 953 |
file_found = False
|
| 954 |
|
| 955 |
+
logger.info(f"Looking for file: {filename}")
|
| 956 |
+
|
| 957 |
# Search through dataset rows
|
| 958 |
+
for row_idx, row in enumerate(ds):
|
| 959 |
+
# The 'pdf' column contains file objects from HF datasets
|
| 960 |
+
pdf_data = row.get('pdf', None)
|
| 961 |
+
|
| 962 |
+
if pdf_data is None:
|
| 963 |
+
continue
|
| 964 |
+
|
| 965 |
+
# Extract the actual filename from the pdf data
|
| 966 |
+
# HF datasets library returns file objects as dicts with 'path' key
|
| 967 |
+
if isinstance(pdf_data, dict):
|
| 968 |
+
row_filename = pdf_data.get('path', '')
|
| 969 |
+
elif isinstance(pdf_data, str):
|
| 970 |
+
row_filename = pdf_data
|
| 971 |
+
else:
|
| 972 |
+
# Try to get path attribute (for other formats)
|
| 973 |
+
row_filename = getattr(pdf_data, 'path', '') or str(pdf_data)
|
| 974 |
+
|
| 975 |
+
row_basename = os.path.basename(str(row_filename))
|
| 976 |
+
logger.debug(f"Row {row_idx}: checking '{row_basename}' vs '{filename}'")
|
| 977 |
|
| 978 |
+
# Match by filename
|
| 979 |
+
if row_basename == filename:
|
| 980 |
temp_file_path = os.path.join(temp_dir, filename)
|
| 981 |
+
logger.info(f"Found match! Extracting {filename}...")
|
| 982 |
|
| 983 |
+
try:
|
| 984 |
+
# Handle different data formats from HF datasets
|
| 985 |
+
if isinstance(pdf_data, dict):
|
| 986 |
+
if 'bytes' in pdf_data and pdf_data['bytes']:
|
| 987 |
+
# Most common: dict with 'bytes' key
|
|
|
|
|
|
|
|
|
|
| 988 |
with open(temp_file_path, 'wb') as f:
|
| 989 |
+
f.write(pdf_data['bytes'])
|
| 990 |
+
logger.info(f"Wrote {len(pdf_data['bytes'])} bytes to {temp_file_path}")
|
| 991 |
+
elif 'path' in pdf_data and pdf_data['path'] and os.path.exists(pdf_data['path']):
|
| 992 |
+
# File path exists on disk (HF caches files)
|
| 993 |
+
shutil.copy2(pdf_data['path'], temp_file_path)
|
| 994 |
+
logger.info(f"Copied from cache: {pdf_data['path']}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 995 |
else:
|
| 996 |
+
logger.error(f"Dict has no usable data: {list(pdf_data.keys())}")
|
| 997 |
continue
|
| 998 |
+
elif isinstance(pdf_data, bytes):
|
| 999 |
+
# Direct bytes
|
| 1000 |
+
with open(temp_file_path, 'wb') as f:
|
| 1001 |
+
f.write(pdf_data)
|
| 1002 |
+
elif isinstance(pdf_data, str) and os.path.exists(pdf_data):
|
| 1003 |
+
# File path string
|
| 1004 |
+
shutil.copy2(pdf_data, temp_file_path)
|
| 1005 |
+
else:
|
| 1006 |
+
logger.error(f"Unknown PDF data type: {type(pdf_data)}")
|
| 1007 |
continue
|
| 1008 |
+
|
| 1009 |
+
copied_files.append(temp_file_path)
|
| 1010 |
+
file_size_mb = os.path.getsize(temp_file_path) / (1024 * 1024)
|
| 1011 |
+
file_info_text += f"📄 {filename} ({file_size_mb:.2f} MB)\n"
|
| 1012 |
+
file_found = True
|
| 1013 |
+
logger.info(f"✅ Successfully extracted {filename}")
|
| 1014 |
+
break
|
| 1015 |
+
except Exception as ex:
|
| 1016 |
+
logger.error(f"Failed to extract {filename}: {ex}", exc_info=True)
|
| 1017 |
+
continue
|
| 1018 |
|
| 1019 |
if not file_found:
|
| 1020 |
+
# Debug: show what's actually in the dataset
|
| 1021 |
+
logger.warning(f"❌ File {filename} not found in dataset rows")
|
| 1022 |
+
for idx, row in enumerate(ds):
|
| 1023 |
+
pdf_data = row.get('pdf', None)
|
| 1024 |
+
if pdf_data:
|
| 1025 |
+
if isinstance(pdf_data, dict):
|
| 1026 |
+
available_name = os.path.basename(str(pdf_data.get('path', 'unknown')))
|
| 1027 |
+
else:
|
| 1028 |
+
available_name = str(type(pdf_data))
|
| 1029 |
+
logger.info(f" Available file in row {idx}: '{available_name}'")
|
| 1030 |
file_info_text += f"⚠️ {filename} - Not found in dataset\n"
|
| 1031 |
+
|
|
|
|
|
|
|
|
|
|
| 1032 |
if not copied_files:
|
| 1033 |
if len(ds) > 0:
|
| 1034 |
logger.error(f"Dataset structure: {list(ds[0].keys())}")
|
|
|
|
| 1102 |
if is_hf_space:
|
| 1103 |
# Hugging Face Spaces configuration
|
| 1104 |
logger.info("Running on Hugging Face Spaces")
|
| 1105 |
+
demo.launch(theme=gr.themes.Soft(), server_name="0.0.0.0", server_port=7860, css=css, js=js)
|
| 1106 |
else:
|
| 1107 |
# Local development configuration
|
| 1108 |
configured_port = int(os.environ.get("GRADIO_SERVER_PORT", "7860"))
|
| 1109 |
server_port = _find_open_port(configured_port)
|
| 1110 |
logger.info(f"Launching Gradio on port {server_port}")
|
| 1111 |
logger.info(f"Access the app at: http://127.0.0.1:{server_port}")
|
| 1112 |
+
demo.launch(theme=gr.themes.Soft(), server_name="127.0.0.1", server_port=server_port, share=False, css=css, js=js)
|
| 1113 |
|
| 1114 |
|
| 1115 |
if __name__ == "__main__":
|