Spaces:

TilanB
/

SmartDocAI

Sleeping

App Files Files Community

TilanB committed on Jan 1

Commit

94d209d

verified ·

1 Parent(s): c6f51a5

Update main.py

Browse files

Files changed (1) hide show

main.py +42 -27

main.py CHANGED Viewed

@@ -945,41 +945,56 @@ setInterval(tick, 500);
                             # Search through dataset rows
                             for row in ds:
                                 # Check if this row contains our file
-                                # Adjust field names based on your dataset structure
-                                row_filename = row.get('filename') or row.get('name') or row.get('path', '')
-                                if os.path.basename(str(row_filename)) == filename:
                                     temp_file_path = os.path.join(temp_dir, filename)
-                                    # Handle different dataset column formats
-                                    if 'content' in row and row['content']:
-                                        with open(temp_file_path, 'wb') as f:
-                                            f.write(row['content'])
-                                    elif 'file' in row and row['file']:
-                                        file_obj = row['file']
-                                        if isinstance(file_obj, dict) and 'bytes' in file_obj:
-                                            with open(temp_file_path, 'wb') as f:
-                                                f.write(file_obj['bytes'])
-                                        elif isinstance(file_obj, bytes):
-                                            with open(temp_file_path, 'wb') as f:
-                                                f.write(file_obj)
-                                    elif 'data' in row and row['data']:
-                                        with open(temp_file_path, 'wb') as f:
-                                            f.write(row['data'])
-                                    else:
-                                        logger.warning(f"Unknown dataset format for {filename}, fields: {list(row.keys())}")
-                                        continue
-                                    copied_files.append(temp_file_path)
-                                    file_size_mb = os.path.getsize(temp_file_path) / (1024 * 1024)
-                                    file_info_text += f"📄 {filename} ({file_size_mb:.2f} MB)\n"
-                                    file_found = True
-                                    logger.info(f"Successfully extracted {filename} from dataset")
-                                    break
                             if not file_found:
                                 logger.warning(f"File {filename} not found in dataset rows")
                                 file_info_text += f"⚠️ {filename} - Not found in dataset\n"
                         if not copied_files:
                             if len(ds) > 0:

                             # Search through dataset rows
                             for row in ds:
                                 # Check if this row contains our file
+                                # The dataset has a 'pdf' column with file paths
+                                row_filename = row.get('pdf', '')
+                                # Match by filename (the PDF column stores filenames)
+                                if isinstance(row_filename, str) and os.path.basename(row_filename) == filename:
                                     temp_file_path = os.path.join(temp_dir, filename)
+                                    # The 'pdf' column contains the actual file path/data
+                                    # Datasets library auto-loads files from the 'pdf' column
+                                    pdf_data = row.get('pdf')
+                                    if pdf_data:
+                                        try:
+                                            # Check if it's already bytes
+                                            if isinstance(pdf_data, bytes):
+                                                with open(temp_file_path, 'wb') as f:
+                                                    f.write(pdf_data)
+                                            # Check if it's a dict with 'bytes' key (common format)
+                                            elif isinstance(pdf_data, dict):
+                                                if 'bytes' in pdf_data:
+                                                    with open(temp_file_path, 'wb') as f:
+                                                        f.write(pdf_data['bytes'])
+                                                elif 'path' in pdf_data:
+                                                    # It's a file path, copy the file
+                                                    import shutil
+                                                    shutil.copy2(pdf_data['path'], temp_file_path)
+                                            # Try to read as file path string
+                                            elif isinstance(pdf_data, str) and os.path.exists(pdf_data):
+                                                import shutil
+                                                shutil.copy2(pdf_data, temp_file_path)
+                                            else:
+                                                logger.error(f"Unknown PDF data format: {type(pdf_data)}")
+                                                continue
+                                            copied_files.append(temp_file_path)
+                                            file_size_mb = os.path.getsize(temp_file_path) / (1024 * 1024)
+                                            file_info_text += f"📄 {filename} ({file_size_mb:.2f} MB)\n"
+                                            file_found = True
+                                            logger.info(f"Successfully extracted {filename} from dataset")
+                                            break
+                                        except Exception as ex:
+                                            logger.error(f"Failed to extract {filename}: {ex}")
+                                            continue
                             if not file_found:
                                 logger.warning(f"File {filename} not found in dataset rows")
                                 file_info_text += f"⚠️ {filename} - Not found in dataset\n"
+                                # Debug: print available filenames
+                                available = [row.get('pdf', 'N/A') for row in ds]
+                                logger.debug(f"Available files in dataset: {available}")
                         if not copied_files:
                             if len(ds) > 0: