duqing2026 commited on
Commit
b6bdef5
·
1 Parent(s): c2238c0

Feat: implement dataset-based data persistence

Browse files
Dockerfile CHANGED
@@ -57,9 +57,16 @@ COPY --from=builder --chown=nextjs:nodejs /app/.next/static ./.next/static
57
  # Create directory if it doesn't exist
58
  # We copy existing stores so the demo works out of the box
59
  COPY --from=builder --chown=nextjs:nodejs /app/vector_store ./vector_store
 
 
60
  # Copy source documents
61
  COPY --from=builder --chown=nextjs:nodejs /app/data ./data
62
 
 
 
 
 
 
63
  USER nextjs
64
 
65
  EXPOSE 7860
@@ -67,4 +74,4 @@ EXPOSE 7860
67
  ENV PORT 7860
68
  ENV HOSTNAME "0.0.0.0"
69
 
70
- CMD ["node", "server.js"]
 
57
  # Create directory if it doesn't exist
58
  # We copy existing stores so the demo works out of the box
59
  COPY --from=builder --chown=nextjs:nodejs /app/vector_store ./vector_store
60
+ # Copy database (NOTE: Docker COPY has no "if exists" semantics — the build fails if rag-kb.db is absent from the builder stage; ensure it is created there or guard this line)
61
+ COPY --from=builder --chown=nextjs:nodejs /app/rag-kb.db ./rag-kb.db
62
  # Copy source documents
63
  COPY --from=builder --chown=nextjs:nodejs /app/data ./data
64
 
65
+ # Copy scripts for data sync
66
+ COPY --chown=nextjs:nodejs scripts/download_data.py ./scripts/
67
+ COPY --chown=nextjs:nodejs scripts/start.sh ./scripts/
68
+ RUN chmod +x ./scripts/start.sh
69
+
70
  USER nextjs
71
 
72
  EXPOSE 7860
 
74
  ENV PORT 7860
75
  ENV HOSTNAME "0.0.0.0"
76
 
77
+ CMD ["./scripts/start.sh"]
scripts/download_data.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import shutil

from huggingface_hub import snapshot_download


def main():
    """Restore rag-kb.db and vector_store/ from a Hugging Face dataset repo.

    The dataset repo id is read from the HF_DATASET_REPO environment
    variable. A missing variable or any download error is deliberately
    non-fatal so the application can still start with local/empty data.
    """
    repo_id = os.environ.get("HF_DATASET_REPO")
    if not repo_id:
        print("HF_DATASET_REPO environment variable not set. Skipping data download.")
        return

    print(f"Downloading data from dataset: {repo_id}...")
    try:
        # snapshot_download fetches (and caches) only the files we need and
        # returns the local directory holding the snapshot.
        local_dir = snapshot_download(
            repo_id=repo_id,
            repo_type="dataset",
            allow_patterns=["rag-kb.db", "vector_store/*"],
        )

        print(f"Data downloaded to: {local_dir}")

        # Restore the SQLite database, keeping a backup of any existing copy.
        db_source = os.path.join(local_dir, "rag-kb.db")
        if os.path.exists(db_source):
            if os.path.exists("rag-kb.db"):
                print("Backing up existing rag-kb.db...")
                shutil.move("rag-kb.db", "rag-kb.db.bak")

            print("Restoring rag-kb.db...")
            # Explicit destination name (copying to "." relied on the implicit
            # basename behavior of shutil.copy).
            shutil.copy(db_source, "rag-kb.db")
        else:
            print("Warning: rag-kb.db not found in dataset.")

        # Restore the vector store, replacing any existing directory wholesale.
        vs_source = os.path.join(local_dir, "vector_store")
        if os.path.exists(vs_source):
            if os.path.exists("vector_store"):
                print("Removing existing vector_store...")
                shutil.rmtree("vector_store")

            print("Restoring vector_store...")
            shutil.copytree(vs_source, "vector_store")
        else:
            print("Warning: vector_store not found in dataset.")

        print("✅ Data restoration completed.")

    except Exception as e:
        # Best-effort: never block app startup on a failed restore.
        print(f"Error downloading data: {e}")


if __name__ == "__main__":
    main()
scripts/start.sh ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash

# Container entrypoint: optionally restore persisted data from a Hugging
# Face dataset, then hand this process over to the Next.js server.

# Restore step runs only when a dataset repo is configured; the Python
# helper is best-effort and never aborts startup.
if [ -z "$HF_DATASET_REPO" ]; then
    echo "HF_DATASET_REPO not set. Starting with local/empty data."
else
    echo "Attempting to restore data from Hugging Face Dataset..."
    python3 scripts/download_data.py
fi

# exec replaces the shell so node becomes PID 1 and receives signals directly.
echo "Starting Next.js server..."
exec node server.js
scripts/upload_data.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import sys

from huggingface_hub import HfApi, create_repo


def main():
    """Upload the local database and vector store to a Hugging Face dataset.

    Interactive helper: resolves a write token (HF_TOKEN env var or prompt)
    and a dataset repo id, creates the private repo if needed, then uploads
    rag-kb.db plus every file under vector_store/. Exits with status 1 when
    no token is provided or the repository cannot be created.
    """
    print("=== Rag-KB Data Uploader ===")
    print("This script helps you upload your local database and vector store to a Hugging Face Dataset.")
    print("This ensures your data persists across Space deployments.\n")

    # Resolve the API token: environment first, then interactive prompt.
    token = os.environ.get("HF_TOKEN")
    if not token:
        token = input("Please enter your Hugging Face Write Token: ").strip()

    if not token:
        print("Error: Token is required.")
        sys.exit(1)

    default_repo = "duqing2026/rag-kb-data"
    repo_id = input(f"Enter Dataset Repo ID (default: {default_repo}): ").strip() or default_repo

    api = HfApi(token=token)

    # Create the dataset repo if it does not already exist (private by default).
    print(f"\nChecking/Creating dataset repository: {repo_id}...")
    try:
        create_repo(repo_id=repo_id, repo_type="dataset", exist_ok=True, token=token, private=True)
        print("Repository ready.")
    except Exception as e:
        print(f"Error creating repository: {e}")
        sys.exit(1)

    # (local_path, path_in_repo) pairs to push.
    files_to_upload = [
        ("rag-kb.db", "rag-kb.db"),
    ]

    # Mirror the vector_store/ tree into the repo, preserving its layout.
    vector_store_path = "vector_store"
    if os.path.exists(vector_store_path):
        for root, _dirs, files in os.walk(vector_store_path):
            for name in files:
                local_path = os.path.join(root, name)
                # Hub repo paths always use forward slashes; normalize so the
                # script also works when run from Windows.
                repo_path = local_path.replace(os.sep, "/")
                files_to_upload.append((local_path, repo_path))
    else:
        print("Warning: vector_store directory not found.")

    print(f"\nStarting upload of {len(files_to_upload)} files...")

    failures = 0
    for local_path, repo_path in files_to_upload:
        if not os.path.exists(local_path):
            print(f"Skipping {local_path} (not found)")
            continue

        print(f"Uploading {local_path} -> {repo_path}...")
        try:
            api.upload_file(
                path_or_fileobj=local_path,
                path_in_repo=repo_path,
                repo_id=repo_id,
                repo_type="dataset",
                token=token,
            )
        except Exception as e:
            print(f"Failed to upload {local_path}: {e}")
            failures += 1

    # Only claim success when every attempted upload actually went through.
    if failures:
        print(f"\n⚠️ Upload finished with {failures} failure(s). See messages above.")
    else:
        print("\n✅ Upload completed successfully!")
    print("Please set the following environment variable in your Hugging Face Space settings:")
    print(f"HF_DATASET_REPO={repo_id}")


if __name__ == "__main__":
    main()