Spaces:

TilanB
/

SmartDocAI

Sleeping

App Files Files Community

TilanB committed on Jan 1

Commit

9d7baa4

verified ·

1 Parent(s): 80f44a7

Update main.py

Browse files

Files changed (1) hide show

main.py +78 -45

main.py CHANGED Viewed

@@ -937,65 +937,98 @@ setInterval(tick, 500);
                         # Create temp directory for files
                         temp_dir = tempfile.mkdtemp(prefix='hf_examples_')
                         # Extract requested files from dataset rows
                         for file_path in file_names:
                             filename = os.path.basename(file_path)
                             file_found = False
                             # Search through dataset rows
-                            for row in ds:
-                                # Check if this row contains our file
-                                # The dataset has a 'pdf' column with file paths
-                                row_filename = row.get('pdf', '')
-                                # Match by filename (the PDF column stores filenames)
-                                if isinstance(row_filename, str) and os.path.basename(row_filename) == filename:
                                     temp_file_path = os.path.join(temp_dir, filename)
-                                    # The 'pdf' column contains the actual file path/data
-                                    # Datasets library auto-loads files from the 'pdf' column
-                                    pdf_data = row.get('pdf')
-                                    if pdf_data:
-                                        try:
-                                            # Check if it's already bytes
-                                            if isinstance(pdf_data, bytes):
                                                 with open(temp_file_path, 'wb') as f:
-                                                    f.write(pdf_data)
-                                            # Check if it's a dict with 'bytes' key (common format)
-                                            elif isinstance(pdf_data, dict):
-                                                if 'bytes' in pdf_data:
-                                                    with open(temp_file_path, 'wb') as f:
-                                                        f.write(pdf_data['bytes'])
-                                                elif 'path' in pdf_data:
-                                                    # It's a file path, copy the file
-                                                    import shutil
-                                                    shutil.copy2(pdf_data['path'], temp_file_path)
-                                            # Try to read as file path string
-                                            elif isinstance(pdf_data, str) and os.path.exists(pdf_data):
-                                                import shutil
-                                                shutil.copy2(pdf_data, temp_file_path)
                                             else:
-                                                logger.error(f"Unknown PDF data format: {type(pdf_data)}")
                                                 continue
-                                            copied_files.append(temp_file_path)
-                                            file_size_mb = os.path.getsize(temp_file_path) / (1024 * 1024)
-                                            file_info_text += f"📄 {filename} ({file_size_mb:.2f} MB)\n"
-                                            file_found = True
-                                            logger.info(f"Successfully extracted {filename} from dataset")
-                                            break
-                                        except Exception as ex:
-                                            logger.error(f"Failed to extract {filename}: {ex}")
                                             continue
                             if not file_found:
-                                logger.warning(f"File {filename} not found in dataset rows")
                                 file_info_text += f"⚠️ {filename} - Not found in dataset\n"
-                                # Debug: print available filenames
-                                available = [row.get('pdf', 'N/A') for row in ds]
-                                logger.debug(f"Available files in dataset: {available}")
                         if not copied_files:
                             if len(ds) > 0:
                                 logger.error(f"Dataset structure: {list(ds[0].keys())}")
@@ -1069,14 +1102,14 @@ setInterval(tick, 500);
     if is_hf_space:
         # Hugging Face Spaces configuration
         logger.info("Running on Hugging Face Spaces")
-        demo.launch(theme=gr.themes.Soft(),server_name="0.0.0.0", server_port=7860, css=css, js=js)
     else:
         # Local development configuration
         configured_port = int(os.environ.get("GRADIO_SERVER_PORT", "7860"))
         server_port = _find_open_port(configured_port)
         logger.info(f"Launching Gradio on port {server_port}")
         logger.info(f"Access the app at: http://127.0.0.1:{server_port}")
-        demo.launch(theme=gr.themes.Soft(),server_port=server_port, share=False, css=css, js=js)
 if __name__ == "__main__":

                         # Create temp directory for files
                         temp_dir = tempfile.mkdtemp(prefix='hf_examples_')
+                        # Debug: Log first row structure
+                        if len(ds) > 0:
+                            first_row = ds[0]
+                            pdf_data = first_row.get('pdf', None)
+                            logger.info(f"Dataset first row 'pdf' type: {type(pdf_data)}")
+                            if isinstance(pdf_data, dict):
+                                logger.info(f"PDF dict keys: {list(pdf_data.keys())}")
+                                if 'path' in pdf_data:
+                                    logger.info(f"PDF path example: {pdf_data.get('path', 'N/A')}")
                         # Extract requested files from dataset rows
                         for file_path in file_names:
                             filename = os.path.basename(file_path)
                             file_found = False
+                            logger.info(f"Looking for file: {filename}")
                             # Search through dataset rows
+                            for row_idx, row in enumerate(ds):
+                                # The 'pdf' column contains file objects from HF datasets
+                                pdf_data = row.get('pdf', None)
+                                if pdf_data is None:
+                                    continue
+                                # Extract the actual filename from the pdf data
+                                # HF datasets library returns file objects as dicts with 'path' key
+                                if isinstance(pdf_data, dict):
+                                    row_filename = pdf_data.get('path', '')
+                                elif isinstance(pdf_data, str):
+                                    row_filename = pdf_data
+                                else:
+                                    # Try to get path attribute (for other formats)
+                                    row_filename = getattr(pdf_data, 'path', '') or str(pdf_data)
+                                row_basename = os.path.basename(str(row_filename))
+                                logger.debug(f"Row {row_idx}: checking '{row_basename}' vs '{filename}'")
+                                # Match by filename
+                                if row_basename == filename:
                                     temp_file_path = os.path.join(temp_dir, filename)
+                                    logger.info(f"Found match! Extracting {filename}...")
+                                    try:
+                                        # Handle different data formats from HF datasets
+                                        if isinstance(pdf_data, dict):
+                                            if 'bytes' in pdf_data and pdf_data['bytes']:
+                                                # Most common: dict with 'bytes' key
                                                 with open(temp_file_path, 'wb') as f:
+                                                    f.write(pdf_data['bytes'])
+                                                logger.info(f"Wrote {len(pdf_data['bytes'])} bytes to {temp_file_path}")
+                                            elif 'path' in pdf_data and pdf_data['path'] and os.path.exists(pdf_data['path']):
+                                                # File path exists on disk (HF caches files)
+                                                shutil.copy2(pdf_data['path'], temp_file_path)
+                                                logger.info(f"Copied from cache: {pdf_data['path']}")
                                             else:
+                                                logger.error(f"Dict has no usable data: {list(pdf_data.keys())}")
                                                 continue
+                                        elif isinstance(pdf_data, bytes):
+                                            # Direct bytes
+                                            with open(temp_file_path, 'wb') as f:
+                                                f.write(pdf_data)
+                                        elif isinstance(pdf_data, str) and os.path.exists(pdf_data):
+                                            # File path string
+                                            shutil.copy2(pdf_data, temp_file_path)
+                                        else:
+                                            logger.error(f"Unknown PDF data type: {type(pdf_data)}")
                                             continue
+                                        copied_files.append(temp_file_path)
+                                        file_size_mb = os.path.getsize(temp_file_path) / (1024 * 1024)
+                                        file_info_text += f"📄 {filename} ({file_size_mb:.2f} MB)\n"
+                                        file_found = True
+                                        logger.info(f"✅ Successfully extracted {filename}")
+                                        break
+                                    except Exception as ex:
+                                        logger.error(f"Failed to extract {filename}: {ex}", exc_info=True)
+                                        continue
                             if not file_found:
+                                # Debug: show what's actually in the dataset
+                                logger.warning(f"❌ File {filename} not found in dataset rows")
+                                for idx, row in enumerate(ds):
+                                    pdf_data = row.get('pdf', None)
+                                    if pdf_data:
+                                        if isinstance(pdf_data, dict):
+                                            available_name = os.path.basename(str(pdf_data.get('path', 'unknown')))
+                                        else:
+                                            available_name = str(type(pdf_data))
+                                        logger.info(f"  Available file in row {idx}: '{available_name}'")
                                 file_info_text += f"⚠️ {filename} - Not found in dataset\n"
                         if not copied_files:
                             if len(ds) > 0:
                                 logger.error(f"Dataset structure: {list(ds[0].keys())}")
     if is_hf_space:
         # Hugging Face Spaces configuration
         logger.info("Running on Hugging Face Spaces")
+        demo.launch(theme=gr.themes.Soft(), server_name="0.0.0.0", server_port=7860, css=css, js=js)
     else:
         # Local development configuration
         configured_port = int(os.environ.get("GRADIO_SERVER_PORT", "7860"))
         server_port = _find_open_port(configured_port)
         logger.info(f"Launching Gradio on port {server_port}")
         logger.info(f"Access the app at: http://127.0.0.1:{server_port}")
+        demo.launch(theme=gr.themes.Soft(), server_name="127.0.0.1", server_port=server_port, share=False, css=css, js=js)
 if __name__ == "__main__":