Spaces:
Sleeping
Sleeping
Update main.py
Browse files
main.py
CHANGED
|
@@ -907,9 +907,23 @@ setInterval(tick, 500);
|
|
| 907 |
copied_files = []
|
| 908 |
file_info_text = f"✅ Loaded: {example_key}\n\n"
|
| 909 |
|
| 910 |
-
# Get HF token
|
| 911 |
hf_token = os.environ.get("HF_TOKEN", None)
|
| 912 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 913 |
try:
|
| 914 |
# Load dataset - uses row-based structure
|
| 915 |
logger.info(f"Loading dataset from HuggingFace: TilanB/smartdoc-samples")
|
|
@@ -934,16 +948,14 @@ setInterval(tick, 500);
|
|
| 934 |
# Adjust field names based on your dataset structure
|
| 935 |
row_filename = row.get('filename') or row.get('name') or row.get('path', '')
|
| 936 |
|
| 937 |
-
if os.path.basename(row_filename) == filename:
|
| 938 |
temp_file_path = os.path.join(temp_dir, filename)
|
| 939 |
|
| 940 |
# Handle different dataset column formats
|
| 941 |
if 'content' in row and row['content']:
|
| 942 |
-
# Binary content stored directly
|
| 943 |
with open(temp_file_path, 'wb') as f:
|
| 944 |
f.write(row['content'])
|
| 945 |
elif 'file' in row and row['file']:
|
| 946 |
-
# File object with bytes
|
| 947 |
file_obj = row['file']
|
| 948 |
if isinstance(file_obj, dict) and 'bytes' in file_obj:
|
| 949 |
with open(temp_file_path, 'wb') as f:
|
|
@@ -952,11 +964,10 @@ setInterval(tick, 500);
|
|
| 952 |
with open(temp_file_path, 'wb') as f:
|
| 953 |
f.write(file_obj)
|
| 954 |
elif 'data' in row and row['data']:
|
| 955 |
-
# Raw data field
|
| 956 |
with open(temp_file_path, 'wb') as f:
|
| 957 |
f.write(row['data'])
|
| 958 |
else:
|
| 959 |
-
logger.warning(f"Unknown dataset format for {filename},
|
| 960 |
continue
|
| 961 |
|
| 962 |
copied_files.append(temp_file_path)
|
|
@@ -971,16 +982,30 @@ setInterval(tick, 500);
|
|
| 971 |
file_info_text += f"⚠️ {filename} - Not found in dataset\n"
|
| 972 |
|
| 973 |
if not copied_files:
|
| 974 |
-
# Log dataset structure for debugging
|
| 975 |
if len(ds) > 0:
|
| 976 |
logger.error(f"Dataset structure: {list(ds[0].keys())}")
|
| 977 |
-
return [], "", f"❌ Could not find example files in dataset.\n\nDataset has {len(ds)} rows. Please
|
| 978 |
|
| 979 |
return copied_files, question_text, file_info_text
|
| 980 |
|
| 981 |
except Exception as e:
|
|
|
|
| 982 |
logger.error(f"Failed to load dataset: {e}", exc_info=True)
|
| 983 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 984 |
|
| 985 |
except ImportError as e:
|
| 986 |
logger.error(f"datasets package not installed: {e}")
|
|
|
|
| 907 |
copied_files = []
|
| 908 |
file_info_text = f"✅ Loaded: {example_key}\n\n"
|
| 909 |
|
| 910 |
+
# Get HF token - REQUIRED for gated datasets
|
| 911 |
hf_token = os.environ.get("HF_TOKEN", None)
|
| 912 |
|
| 913 |
+
if not hf_token:
|
| 914 |
+
logger.warning("HF_TOKEN not set - required for gated datasets")
|
| 915 |
+
return [], "", (
|
| 916 |
+
"❌ **Authentication Required**\n\n"
|
| 917 |
+
"The example dataset is gated and requires authentication.\n\n"
|
| 918 |
+
"**To fix:**\n"
|
| 919 |
+
"1. Go to Space Settings → Repository secrets\n"
|
| 920 |
+
"2. Add secret: `HF_TOKEN` = your Hugging Face token\n"
|
| 921 |
+
"3. Restart the Space\n\n"
|
| 922 |
+
"Or make your dataset public at:\n"
|
| 923 |
+
"https://huggingface.co/datasets/TilanB/smartdoc-samples/settings\n\n"
|
| 924 |
+
"For now, please **upload files manually**."
|
| 925 |
+
)
|
| 926 |
+
|
| 927 |
try:
|
| 928 |
# Load dataset - uses row-based structure
|
| 929 |
logger.info(f"Loading dataset from HuggingFace: TilanB/smartdoc-samples")
|
|
|
|
| 948 |
# Adjust field names based on your dataset structure
|
| 949 |
row_filename = row.get('filename') or row.get('name') or row.get('path', '')
|
| 950 |
|
| 951 |
+
if os.path.basename(str(row_filename)) == filename:
|
| 952 |
temp_file_path = os.path.join(temp_dir, filename)
|
| 953 |
|
| 954 |
# Handle different dataset column formats
|
| 955 |
if 'content' in row and row['content']:
|
|
|
|
| 956 |
with open(temp_file_path, 'wb') as f:
|
| 957 |
f.write(row['content'])
|
| 958 |
elif 'file' in row and row['file']:
|
|
|
|
| 959 |
file_obj = row['file']
|
| 960 |
if isinstance(file_obj, dict) and 'bytes' in file_obj:
|
| 961 |
with open(temp_file_path, 'wb') as f:
|
|
|
|
| 964 |
with open(temp_file_path, 'wb') as f:
|
| 965 |
f.write(file_obj)
|
| 966 |
elif 'data' in row and row['data']:
|
|
|
|
| 967 |
with open(temp_file_path, 'wb') as f:
|
| 968 |
f.write(row['data'])
|
| 969 |
else:
|
| 970 |
+
logger.warning(f"Unknown dataset format for {filename}, fields: {list(row.keys())}")
|
| 971 |
continue
|
| 972 |
|
| 973 |
copied_files.append(temp_file_path)
|
|
|
|
| 982 |
file_info_text += f"⚠️ {filename} - Not found in dataset\n"
|
| 983 |
|
| 984 |
if not copied_files:
|
|
|
|
| 985 |
if len(ds) > 0:
|
| 986 |
logger.error(f"Dataset structure: {list(ds[0].keys())}")
|
| 987 |
+
return [], "", f"❌ Could not find example files in dataset.\n\nDataset has {len(ds)} rows. Please upload files manually."
|
| 988 |
|
| 989 |
return copied_files, question_text, file_info_text
|
| 990 |
|
| 991 |
except Exception as e:
|
| 992 |
+
error_msg = str(e)
|
| 993 |
logger.error(f"Failed to load dataset: {e}", exc_info=True)
|
| 994 |
+
|
| 995 |
+
# Check for gated dataset error
|
| 996 |
+
if "gated" in error_msg.lower() or "authenticated" in error_msg.lower():
|
| 997 |
+
return [], "", (
|
| 998 |
+
"❌ **Dataset Access Denied**\n\n"
|
| 999 |
+
"The dataset is gated and your token doesn't have access.\n\n"
|
| 1000 |
+
"**To fix:**\n"
|
| 1001 |
+
"1. Visit: https://huggingface.co/datasets/TilanB/smartdoc-samples\n"
|
| 1002 |
+
"2. Accept the access terms (if any)\n"
|
| 1003 |
+
"3. Make sure HF_TOKEN is set in Space secrets\n\n"
|
| 1004 |
+
"Or make your dataset public.\n\n"
|
| 1005 |
+
"For now, please **upload files manually**."
|
| 1006 |
+
)
|
| 1007 |
+
|
| 1008 |
+
return [], "", f"❌ Failed to load dataset: {error_msg}\n\nPlease upload files manually."
|
| 1009 |
|
| 1010 |
except ImportError as e:
|
| 1011 |
logger.error(f"datasets package not installed: {e}")
|