Spaces:

TilanB
/

SmartDocAI

Sleeping

App Files Files Community

TilanB committed on Jan 1

Commit

ca6762d

verified ·

1 Parent(s): 9d7baa4

Update main.py

Browse files

Files changed (1) hide show

main.py +73 -34

main.py CHANGED Viewed

@@ -942,11 +942,16 @@ setInterval(tick, 500);
                             first_row = ds[0]
                             pdf_data = first_row.get('pdf', None)
                             logger.info(f"Dataset first row 'pdf' type: {type(pdf_data)}")
-                            if isinstance(pdf_data, dict):
                                 logger.info(f"PDF dict keys: {list(pdf_data.keys())}")
                                 if 'path' in pdf_data:
-                                    logger.info(f"PDF path example: {pdf_data.get('path', 'N/A')}")
                         # Extract requested files from dataset rows
                         for file_path in file_names:
                             filename = os.path.basename(file_path)
@@ -963,14 +968,27 @@ setInterval(tick, 500);
                                     continue
                                 # Extract the actual filename from the pdf data
-                                # HF datasets library returns file objects as dicts with 'path' key
-                                if isinstance(pdf_data, dict):
                                     row_filename = pdf_data.get('path', '')
                                 elif isinstance(pdf_data, str):
                                     row_filename = pdf_data
-                                else:
-                                    # Try to get path attribute (for other formats)
-                                    row_filename = getattr(pdf_data, 'path', '') or str(pdf_data)
                                 row_basename = os.path.basename(str(row_filename))
                                 logger.debug(f"Row {row_idx}: checking '{row_basename}' vs '{filename}'")
@@ -981,51 +999,72 @@ setInterval(tick, 500);
                                     logger.info(f"Found match! Extracting {filename}...")
                                     try:
-                                        # Handle different data formats from HF datasets
-                                        if isinstance(pdf_data, dict):
                                             if 'bytes' in pdf_data and pdf_data['bytes']:
-                                                # Most common: dict with 'bytes' key
                                                 with open(temp_file_path, 'wb') as f:
                                                     f.write(pdf_data['bytes'])
-                                                logger.info(f"Wrote {len(pdf_data['bytes'])} bytes to {temp_file_path}")
                                             elif 'path' in pdf_data and pdf_data['path'] and os.path.exists(pdf_data['path']):
-                                                # File path exists on disk (HF caches files)
                                                 shutil.copy2(pdf_data['path'], temp_file_path)
-                                                logger.info(f"Copied from cache: {pdf_data['path']}")
-                                            else:
-                                                logger.error(f"Dict has no usable data: {list(pdf_data.keys())}")
-                                                continue
                                         elif isinstance(pdf_data, bytes):
-                                            # Direct bytes
                                             with open(temp_file_path, 'wb') as f:
                                                 f.write(pdf_data)
                                         elif isinstance(pdf_data, str) and os.path.exists(pdf_data):
-                                            # File path string
                                             shutil.copy2(pdf_data, temp_file_path)
-                                        else:
-                                            logger.error(f"Unknown PDF data type: {type(pdf_data)}")
-                                            continue
-                                        copied_files.append(temp_file_path)
-                                        file_size_mb = os.path.getsize(temp_file_path) / (1024 * 1024)
-                                        file_info_text += f"📄 {filename} ({file_size_mb:.2f} MB)\n"
-                                        file_found = True
-                                        logger.info(f"✅ Successfully extracted {filename}")
-                                        break
                                     except Exception as ex:
                                         logger.error(f"Failed to extract {filename}: {ex}", exc_info=True)
                                         continue
                             if not file_found:
-                                # Debug: show what's actually in the dataset
                                 logger.warning(f"❌ File {filename} not found in dataset rows")
                                 for idx, row in enumerate(ds):
                                     pdf_data = row.get('pdf', None)
-                                    if pdf_data:
-                                        if isinstance(pdf_data, dict):
-                                            available_name = os.path.basename(str(pdf_data.get('path', 'unknown')))
-                                        else:
-                                            available_name = str(type(pdf_data))
                                         logger.info(f"  Available file in row {idx}: '{available_name}'")
                                 file_info_text += f"⚠️ {filename} - Not found in dataset\n"

                             first_row = ds[0]
                             pdf_data = first_row.get('pdf', None)
                             logger.info(f"Dataset first row 'pdf' type: {type(pdf_data)}")
+                            # Handle different types
+                            if hasattr(pdf_data, 'stream') and hasattr(pdf_data.stream, 'name'):
+                                # pdfplumber PDF object
+                                logger.info(f"PDF is pdfplumber object, stream path: {pdf_data.stream.name}")
+                            elif isinstance(pdf_data, dict):
                                 logger.info(f"PDF dict keys: {list(pdf_data.keys())}")
                                 if 'path' in pdf_data:
+                                    logger.info(f"PDF path: {pdf_data.get('path', 'N/A')}")
                         # Extract requested files from dataset rows
                         for file_path in file_names:
                             filename = os.path.basename(file_path)
                                     continue
                                 # Extract the actual filename from the pdf data
+                                # HF datasets with PDF files can return different types:
+                                # 1. pdfplumber.pdf.PDF objects (when using pdf feature type)
+                                # 2. dict with 'path' and 'bytes' keys
+                                # 3. str path
+                                # 4. bytes directly
+                                row_filename = ""
+                                # Check for pdfplumber PDF object (has .stream.name attribute)
+                                if hasattr(pdf_data, 'stream') and hasattr(pdf_data.stream, 'name'):
+                                    row_filename = pdf_data.stream.name
+                                    logger.debug(f"Got filename from pdfplumber stream: {row_filename}")
+                                # Check for pdfplumber PDF object with path attribute
+                                elif hasattr(pdf_data, 'path'):
+                                    row_filename = pdf_data.path
+                                # Check for dict format
+                                elif isinstance(pdf_data, dict):
                                     row_filename = pdf_data.get('path', '')
+                                # Check for string path
                                 elif isinstance(pdf_data, str):
                                     row_filename = pdf_data
                                 row_basename = os.path.basename(str(row_filename))
                                 logger.debug(f"Row {row_idx}: checking '{row_basename}' vs '{filename}'")
                                     logger.info(f"Found match! Extracting {filename}...")
                                     try:
+                                        extracted = False
+                                        # Handle pdfplumber PDF object
+                                        if hasattr(pdf_data, 'stream'):
+                                            # Get the file path from pdfplumber's stream
+                                            source_path = pdf_data.stream.name
+                                            if source_path and os.path.exists(source_path):
+                                                shutil.copy2(source_path, temp_file_path)
+                                                logger.info(f"Copied from pdfplumber stream: {source_path}")
+                                                extracted = True
+                                            else:
+                                                # Try to read bytes from stream
+                                                try:
+                                                    pdf_data.stream.seek(0)
+                                                    pdf_bytes = pdf_data.stream.read()
+                                                    with open(temp_file_path, 'wb') as f:
+                                                        f.write(pdf_bytes)
+                                                    logger.info(f"Wrote {len(pdf_bytes)} bytes from pdfplumber stream")
+                                                    extracted = True
+                                                except Exception as stream_err:
+                                                    logger.warning(f"Could not read stream: {stream_err}")
+                                        # Handle dict format
+                                        elif isinstance(pdf_data, dict):
                                             if 'bytes' in pdf_data and pdf_data['bytes']:
                                                 with open(temp_file_path, 'wb') as f:
                                                     f.write(pdf_data['bytes'])
+                                                logger.info(f"Wrote {len(pdf_data['bytes'])} bytes")
+                                                extracted = True
                                             elif 'path' in pdf_data and pdf_data['path'] and os.path.exists(pdf_data['path']):
                                                 shutil.copy2(pdf_data['path'], temp_file_path)
+                                                logger.info(f"Copied from dict path: {pdf_data['path']}")
+                                                extracted = True
+                                        # Handle bytes directly
                                         elif isinstance(pdf_data, bytes):
                                             with open(temp_file_path, 'wb') as f:
                                                 f.write(pdf_data)
+                                            extracted = True
+                                        # Handle string path
                                         elif isinstance(pdf_data, str) and os.path.exists(pdf_data):
                                             shutil.copy2(pdf_data, temp_file_path)
+                                            extracted = True
+                                        if extracted and os.path.exists(temp_file_path):
+                                            copied_files.append(temp_file_path)
+                                            file_size_mb = os.path.getsize(temp_file_path) / (1024 * 1024)
+                                            file_info_text += f"📄 {filename} ({file_size_mb:.2f} MB)\n"
+                                            file_found = True
+                                            logger.info(f"✅ Successfully extracted {filename}")
+                                            break
+                                        else:
+                                            logger.error(f"Could not extract file: {type(pdf_data)}")
                                     except Exception as ex:
                                         logger.error(f"Failed to extract {filename}: {ex}", exc_info=True)
                                         continue
                             if not file_found:
                                 logger.warning(f"❌ File {filename} not found in dataset rows")
+                                # Debug: show what's available
                                 for idx, row in enumerate(ds):
                                     pdf_data = row.get('pdf', None)
+                                    if pdf_data and hasattr(pdf_data, 'stream') and hasattr(pdf_data.stream, 'name'):
+                                        available_name = os.path.basename(str(pdf_data.stream.name))
                                         logger.info(f"  Available file in row {idx}: '{available_name}'")
                                 file_info_text += f"⚠️ {filename} - Not found in dataset\n"