Spaces:
Paused
Paused
lanny xu
committed on
Commit
·
ae2e9ee
1
Parent(s):
47b875d
delete vectara
Browse files- kaggle_simple_multimodal.py +49 -2
kaggle_simple_multimodal.py
CHANGED
|
@@ -108,6 +108,42 @@ def query_with_multimodal(rag_system: AdaptiveRAGSystem, query: str, image_paths
|
|
| 108 |
print(f"❌ 查询失败: {e}")
|
| 109 |
return None
|
| 110 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
def main():
|
| 112 |
"""主函数"""
|
| 113 |
print("🚀 Kaggle简化多模态测试")
|
|
@@ -116,12 +152,15 @@ def main():
|
|
| 116 |
# 设置环境
|
| 117 |
setup_kaggle_environment()
|
| 118 |
|
| 119 |
-
#
|
|
|
|
|
|
|
|
|
|
| 120 |
working_dir = '/kaggle/working'
|
| 121 |
pdf_files = [f for f in os.listdir(working_dir) if f.endswith('.pdf')]
|
| 122 |
image_files = [f for f in os.listdir(working_dir) if any(f.lower().endswith(ext) for ext in ['.jpg', '.jpeg', '.png', '.gif', '.bmp'])]
|
| 123 |
|
| 124 |
-
print(f"\n📁
|
| 125 |
print(f" - PDF文件: {len(pdf_files)} 个")
|
| 126 |
for pdf in pdf_files:
|
| 127 |
print(f" * {pdf}")
|
|
@@ -130,6 +169,14 @@ def main():
|
|
| 130 |
for img in image_files:
|
| 131 |
print(f" * {img}")
|
| 132 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
# 处理文件
|
| 134 |
pdf_path = os.path.join(working_dir, pdf_files[0]) if pdf_files else None
|
| 135 |
image_paths = [os.path.join(working_dir, img) for img in image_files] if image_files else None
|
|
|
|
| 108 |
print(f"❌ 查询失败: {e}")
|
| 109 |
return None
|
| 110 |
|
| 111 |
+
def scan_and_copy_files():
|
| 112 |
+
"""扫描 /kaggle/input/ 并复制文件到 /kaggle/working/"""
|
| 113 |
+
import shutil
|
| 114 |
+
|
| 115 |
+
input_dir = '/kaggle/input'
|
| 116 |
+
working_dir = '/kaggle/working'
|
| 117 |
+
|
| 118 |
+
if not os.path.exists(input_dir):
|
| 119 |
+
print("⚠️ /kaggle/input/ 目录不存在,跳过文件扫描")
|
| 120 |
+
return
|
| 121 |
+
|
| 122 |
+
print("📂 扫描 /kaggle/input/ 目录...")
|
| 123 |
+
|
| 124 |
+
copied_pdfs = []
|
| 125 |
+
copied_images = []
|
| 126 |
+
|
| 127 |
+
# 递归扫描所有文件
|
| 128 |
+
for root, dirs, files in os.walk(input_dir):
|
| 129 |
+
for file in files:
|
| 130 |
+
src = os.path.join(root, file)
|
| 131 |
+
dst = os.path.join(working_dir, file)
|
| 132 |
+
|
| 133 |
+
if file.endswith('.pdf'):
|
| 134 |
+
shutil.copy(src, dst)
|
| 135 |
+
copied_pdfs.append(file)
|
| 136 |
+
print(f" ✅ 复制 PDF: {file}")
|
| 137 |
+
elif any(file.lower().endswith(ext) for ext in ['.jpg', '.jpeg', '.png', '.gif', '.bmp']):
|
| 138 |
+
shutil.copy(src, dst)
|
| 139 |
+
copied_images.append(file)
|
| 140 |
+
print(f" ✅ 复制图片: {file}")
|
| 141 |
+
|
| 142 |
+
if copied_pdfs or copied_images:
|
| 143 |
+
print(f"\n📁 复制完成: {len(copied_pdfs)} 个 PDF, {len(copied_images)} 张图片")
|
| 144 |
+
else:
|
| 145 |
+
print("⚠️ 未找到 PDF 或图片文件")
|
| 146 |
+
|
| 147 |
def main():
|
| 148 |
"""主函数"""
|
| 149 |
print("🚀 Kaggle简化多模态测试")
|
|
|
|
| 152 |
# 设置环境
|
| 153 |
setup_kaggle_environment()
|
| 154 |
|
| 155 |
+
# 从 /kaggle/input/ 复制文件到 /kaggle/working/
|
| 156 |
+
scan_and_copy_files()
|
| 157 |
+
|
| 158 |
+
# 检查文件
|
| 159 |
working_dir = '/kaggle/working'
|
| 160 |
pdf_files = [f for f in os.listdir(working_dir) if f.endswith('.pdf')]
|
| 161 |
image_files = [f for f in os.listdir(working_dir) if any(f.lower().endswith(ext) for ext in ['.jpg', '.jpeg', '.png', '.gif', '.bmp'])]
|
| 162 |
|
| 163 |
+
print(f"\n📁 /kaggle/working/ 中的文件:")
|
| 164 |
print(f" - PDF文件: {len(pdf_files)} 个")
|
| 165 |
for pdf in pdf_files:
|
| 166 |
print(f" * {pdf}")
|
|
|
|
| 169 |
for img in image_files:
|
| 170 |
print(f" * {img}")
|
| 171 |
|
| 172 |
+
if not pdf_files and not image_files:
|
| 173 |
+
print("\n💡 使用说明:")
|
| 174 |
+
print(" 1. 在 Kaggle Notebook 右侧点击 '+ Add data'")
|
| 175 |
+
print(" 2. 选择 'Upload' 标签")
|
| 176 |
+
print(" 3. 上传你的 PDF 和图片文件")
|
| 177 |
+
print(" 4. 重新运行此脚本")
|
| 178 |
+
return
|
| 179 |
+
|
| 180 |
# 处理文件
|
| 181 |
pdf_path = os.path.join(working_dir, pdf_files[0]) if pdf_files else None
|
| 182 |
image_paths = [os.path.join(working_dir, img) for img in image_files] if image_files else None
|