Spaces:

TilanB
/

SmartDocAI

Sleeping

App Files Files Community

TilanB committed on Jan 1

Commit

c6f51a5

verified ·

1 Parent(s): c23a6c5

Update main.py

Browse files

Files changed (1) hide show

main.py +34 -9

main.py CHANGED Viewed

@@ -907,9 +907,23 @@ setInterval(tick, 500);
                     copied_files = []
                     file_info_text = f"✅ Loaded: {example_key}\n\n"
-                    # Get HF token (optional for public datasets)
                     hf_token = os.environ.get("HF_TOKEN", None)
                     try:
                         # Load dataset - uses row-based structure
                         logger.info(f"Loading dataset from HuggingFace: TilanB/smartdoc-samples")
@@ -934,16 +948,14 @@ setInterval(tick, 500);
                                 # Adjust field names based on your dataset structure
                                 row_filename = row.get('filename') or row.get('name') or row.get('path', '')
-                                if os.path.basename(row_filename) == filename:
                                     temp_file_path = os.path.join(temp_dir, filename)
                                     # Handle different dataset column formats
                                     if 'content' in row and row['content']:
-                                        # Binary content stored directly
                                         with open(temp_file_path, 'wb') as f:
                                             f.write(row['content'])
                                     elif 'file' in row and row['file']:
-                                        # File object with bytes
                                         file_obj = row['file']
                                         if isinstance(file_obj, dict) and 'bytes' in file_obj:
                                             with open(temp_file_path, 'wb') as f:
@@ -952,11 +964,10 @@ setInterval(tick, 500);
                                             with open(temp_file_path, 'wb') as f:
                                                 f.write(file_obj)
                                     elif 'data' in row and row['data']:
-                                        # Raw data field
                                         with open(temp_file_path, 'wb') as f:
                                             f.write(row['data'])
                                     else:
-                                        logger.warning(f"Unknown dataset format for {filename}, available fields: {list(row.keys())}")
                                         continue
                                     copied_files.append(temp_file_path)
@@ -971,16 +982,30 @@ setInterval(tick, 500);
                                 file_info_text += f"⚠️ {filename} - Not found in dataset\n"
                         if not copied_files:
-                            # Log dataset structure for debugging
                             if len(ds) > 0:
                                 logger.error(f"Dataset structure: {list(ds[0].keys())}")
-                            return [], "", f"❌ Could not find example files in dataset.\n\nDataset has {len(ds)} rows. Please check dataset structure or upload files manually."
                         return copied_files, question_text, file_info_text
                     except Exception as e:
                         logger.error(f"Failed to load dataset: {e}", exc_info=True)
-                        return [], "", f"❌ Failed to load dataset: {str(e)}\n\nPlease upload files manually."
                 except ImportError as e:
                     logger.error(f"datasets package not installed: {e}")

                     copied_files = []
                     file_info_text = f"✅ Loaded: {example_key}\n\n"
+                    # Get HF token - REQUIRED for gated datasets
                     hf_token = os.environ.get("HF_TOKEN", None)
+                    if not hf_token:
+                        logger.warning("HF_TOKEN not set - required for gated datasets")
+                        return [], "", (
+                            "❌ **Authentication Required**\n\n"
+                            "The example dataset is gated and requires authentication.\n\n"
+                            "**To fix:**\n"
+                            "1. Go to Space Settings → Repository secrets\n"
+                            "2. Add secret: `HF_TOKEN` = your Hugging Face token\n"
+                            "3. Restart the Space\n\n"
+                            "Or make your dataset public at:\n"
+                            "https://huggingface.co/datasets/TilanB/smartdoc-samples/settings\n\n"
+                            "For now, please **upload files manually**."
+                        )
                     try:
                         # Load dataset - uses row-based structure
                         logger.info(f"Loading dataset from HuggingFace: TilanB/smartdoc-samples")
                                 # Adjust field names based on your dataset structure
                                 row_filename = row.get('filename') or row.get('name') or row.get('path', '')
+                                if os.path.basename(str(row_filename)) == filename:
                                     temp_file_path = os.path.join(temp_dir, filename)
                                     # Handle different dataset column formats
                                     if 'content' in row and row['content']:
                                         with open(temp_file_path, 'wb') as f:
                                             f.write(row['content'])
                                     elif 'file' in row and row['file']:
                                         file_obj = row['file']
                                         if isinstance(file_obj, dict) and 'bytes' in file_obj:
                                             with open(temp_file_path, 'wb') as f:
                                             with open(temp_file_path, 'wb') as f:
                                                 f.write(file_obj)
                                     elif 'data' in row and row['data']:
                                         with open(temp_file_path, 'wb') as f:
                                             f.write(row['data'])
                                     else:
+                                        logger.warning(f"Unknown dataset format for {filename}, fields: {list(row.keys())}")
                                         continue
                                     copied_files.append(temp_file_path)
                                 file_info_text += f"⚠️ {filename} - Not found in dataset\n"
                         if not copied_files:
                             if len(ds) > 0:
                                 logger.error(f"Dataset structure: {list(ds[0].keys())}")
+                            return [], "", f"❌ Could not find example files in dataset.\n\nDataset has {len(ds)} rows. Please upload files manually."
                         return copied_files, question_text, file_info_text
                     except Exception as e:
+                        error_msg = str(e)
                         logger.error(f"Failed to load dataset: {e}", exc_info=True)
+                        # Check for gated dataset error
+                        if "gated" in error_msg.lower() or "authenticated" in error_msg.lower():
+                            return [], "", (
+                                "❌ **Dataset Access Denied**\n\n"
+                                "The dataset is gated and your token doesn't have access.\n\n"
+                                "**To fix:**\n"
+                                "1. Visit: https://huggingface.co/datasets/TilanB/smartdoc-samples\n"
+                                "2. Accept the access terms (if any)\n"
+                                "3. Make sure HF_TOKEN is set in Space secrets\n\n"
+                                "Or make your dataset public.\n\n"
+                                "For now, please **upload files manually**."
+                            )
+                        return [], "", f"❌ Failed to load dataset: {error_msg}\n\nPlease upload files manually."
                 except ImportError as e:
                     logger.error(f"datasets package not installed: {e}")