Song committed on
Commit
7e7da62
·
1 Parent(s): b55a0b9

refactor: remove local PDF files and use HF Dataset for knowledge base

Browse files
Files changed (2) hide show
  1. rag.py +81 -18
  2. requirements.txt +1 -0
rag.py CHANGED
@@ -16,6 +16,7 @@ from langchain_community.vectorstores import SupabaseVectorStore
16
  from langchain_text_splitters import RecursiveCharacterTextSplitter
17
  from supabase import create_client, Client
18
  from langchain_core.documents import Document
 
19
  from cache import DocumentCache, document_cache, cache_result
20
 
21
  # Configure logging
@@ -73,20 +74,15 @@ class RAGService:
73
  async def load_knowledge_base(self, data_dir: str = "backend/data") -> Dict[str, Any]:
74
  """
75
  Load and process documents from the data directory.
 
76
 
77
  Args:
78
  data_dir: Path to directory containing documents
79
 
80
  Returns:
81
- Dictionary with processing results
82
  """
83
- logger.info(f"Loading knowledge base from {data_dir}")
84
-
85
  data_path = Path(data_dir)
86
- if not data_path.exists():
87
- raise ValueError(f"Data directory {data_dir} does not exist")
88
-
89
- # Track processing results
90
  results = {
91
  "total_files": 0,
92
  "processed_files": 0,
@@ -95,30 +91,97 @@ class RAGService:
95
  "errors": []
96
  }
97
 
98
- # Find all PDF and MD files
99
- pdf_files = list(data_path.glob("**/*.pdf"))
100
- md_files = list(data_path.glob("**/*.md"))
101
- all_files = pdf_files + md_files
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
 
 
 
 
 
 
 
103
  results["total_files"] = len(all_files)
104
 
105
  if not all_files:
106
- logger.warning(f"No PDF or MD files found in {data_dir}")
107
  return results
108
 
109
- logger.info(f"Found {len(all_files)} files to process")
110
-
111
- # Process each file
112
  for file_path in all_files:
113
  try:
114
- await self._process_file(file_path, results)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
  except Exception as e:
116
- error_msg = f"Failed to process {file_path}: {str(e)}"
117
  logger.error(error_msg)
118
  results["errors"].append(error_msg)
119
  results["failed_files"] += 1
120
 
121
- logger.info(f"Knowledge base loading completed: {results}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
  return results
123
 
124
  async def _process_file(self, file_path: Path, results: Dict[str, Any]) -> None:
 
16
  from langchain_text_splitters import RecursiveCharacterTextSplitter
17
  from supabase import create_client, Client
18
  from langchain_core.documents import Document
19
+ from huggingface_hub import snapshot_download
20
  from cache import DocumentCache, document_cache, cache_result
21
 
22
  # Configure logging
 
74
  async def load_knowledge_base(self, data_dir: str = "backend/data") -> Dict[str, Any]:
75
  """
76
  Load and process documents from the data directory.
77
+ If local directory is empty, download from Hugging Face Dataset.
78
 
79
  Args:
80
  data_dir: Path to directory containing documents
81
 
82
  Returns:
83
+ Dictionary with loading statistics
84
  """
 
 
85
  data_path = Path(data_dir)
 
 
 
 
86
  results = {
87
  "total_files": 0,
88
  "processed_files": 0,
 
91
  "errors": []
92
  }
93
 
94
+ # 如果本地資料夾不存在或裡面沒有 PDF/MD 檔案,就從 HF Dataset 下載
95
+ if not data_path.exists() or not any(data_path.glob("*.pdf")) and not any(data_path.glob("*.md")):
96
+ logger.info("Local knowledge base empty or missing. Downloading from Hugging Face Dataset...")
97
+ data_path.mkdir(parents=True, exist_ok=True)
98
+ try:
99
+ snapshot_download(
100
+ repo_id="pcreem/dietinstruction", # ← 這裡一定要正確!
101
+ local_dir=data_dir,
102
+ local_dir_use_symlinks=False,
103
+ repo_type="dataset",
104
+ revision="main",
105
+ allow_patterns=["*.pdf", "*.md", "*.txt"], # 只下載我們需要的檔案
106
+ tqdm_class=None # 避免日誌衝突
107
+ )
108
+ logger.info(f"Successfully downloaded knowledge base to {data_dir}")
109
+ except Exception as e:
110
+ error_msg = f"Failed to download from Hugging Face Dataset: {str(e)}"
111
+ logger.error(error_msg)
112
+ results["errors"].append(error_msg)
113
+ # 如果下載失敗,至少確保資料夾存在
114
+ data_path.mkdir(parents=True, exist_ok=True)
115
+ else:
116
+ logger.info(f"Using existing local knowledge base at {data_dir}")
117
+
118
+ # ===== 以下是原本的檔案載入邏輯(不需改動太多)=====
119
+ documents: List[Document] = []
120
 
121
+ # Supported file types
122
+ pdf_files = list(data_path.glob("*.pdf"))
123
+ md_files = list(data_path.glob("*.md"))
124
+ txt_files = list(data_path.glob("*.txt"))
125
+
126
+ all_files = pdf_files + md_files + txt_files
127
  results["total_files"] = len(all_files)
128
 
129
  if not all_files:
130
+ logger.warning("No documents found in knowledge base directory")
131
  return results
132
 
 
 
 
133
  for file_path in all_files:
134
  try:
135
+ logger.info(f"Processing file: {file_path.name}")
136
+ if file_path.suffix == ".pdf":
137
+ loader = PyPDFLoader(str(file_path))
138
+ elif file_path.suffix == ".md":
139
+ loader = UnstructuredMarkdownLoader(str(file_path))
140
+ elif file_path.suffix == ".txt":
141
+ # Simple text loader
142
+ with open(file_path, "r", encoding="utf-8") as f:
143
+ content = f.read()
144
+ documents.append(Document(
145
+ page_content=content,
146
+ metadata={"file_name": file_path.name, "source": str(file_path)}
147
+ ))
148
+ results["processed_files"] += 1
149
+ continue
150
+ else:
151
+ continue
152
+
153
+ docs = loader.load()
154
+ for doc in docs:
155
+ doc.metadata.update({
156
+ "file_name": file_path.name,
157
+ "source": str(file_path)
158
+ })
159
+ documents.extend(docs)
160
+ results["processed_files"] += 1
161
+
162
  except Exception as e:
163
+ error_msg = f"Error processing {file_path.name}: {str(e)}"
164
  logger.error(error_msg)
165
  results["errors"].append(error_msg)
166
  results["failed_files"] += 1
167
 
168
+ # Split documents into chunks
169
+ if documents:
170
+ chunks = self.text_splitter.split_documents(documents)
171
+ results["total_chunks"] = len(chunks)
172
+ logger.info(f"Created {len(chunks)} document chunks")
173
+
174
+ # Add to vector store (with upsert)
175
+ try:
176
+ self.vector_store.add_documents(chunks)
177
+ logger.info(f"Successfully added {len(chunks)} chunks to vector store")
178
+ except Exception as e:
179
+ error_msg = f"Error adding documents to vector store: {str(e)}"
180
+ logger.error(error_msg)
181
+ results["errors"].append(error_msg)
182
+ else:
183
+ logger.warning("No documents were successfully loaded")
184
+
185
  return results
186
 
187
  async def _process_file(self, file_path: Path, results: Dict[str, Any]) -> None:
requirements.txt CHANGED
@@ -19,6 +19,7 @@ tiktoken
19
  supabase
20
  stripe
21
  httpx
 
22
 
23
  # Utilities
24
  python-dotenv
 
19
  supabase
20
  stripe
21
  httpx
22
+ huggingface_hub
23
 
24
  # Utilities
25
  python-dotenv