Song committed on
Commit
7e7da62
·
1 Parent(s): b55a0b9

refactor: remove local PDF files and use HF Dataset for knowledge base

Browse files
Files changed (2) hide show
  1. rag.py +81 -18
  2. requirements.txt +1 -0
rag.py CHANGED
@@ -16,6 +16,7 @@ from langchain_community.vectorstores import SupabaseVectorStore
16
  from langchain_text_splitters import RecursiveCharacterTextSplitter
17
  from supabase import create_client, Client
18
  from langchain_core.documents import Document
 
19
  from cache import DocumentCache, document_cache, cache_result
20
 
21
  # Configure logging
@@ -73,20 +74,15 @@ class RAGService:
73
  async def load_knowledge_base(self, data_dir: str = "backend/data") -> Dict[str, Any]:
74
  """
75
  Load and process documents from the data directory.
 
76
 
77
  Args:
78
  data_dir: Path to directory containing documents
79
 
80
  Returns:
81
- Dictionary with processing results
82
  """
83
- logger.info(f"Loading knowledge base from {data_dir}")
84
-
85
  data_path = Path(data_dir)
86
- if not data_path.exists():
87
- raise ValueError(f"Data directory {data_dir} does not exist")
88
-
89
- # Track processing results
90
  results = {
91
  "total_files": 0,
92
  "processed_files": 0,
@@ -95,30 +91,97 @@ class RAGService:
95
  "errors": []
96
  }
97
 
98
- # Find all PDF and MD files
99
- pdf_files = list(data_path.glob("**/*.pdf"))
100
- md_files = list(data_path.glob("**/*.md"))
101
- all_files = pdf_files + md_files
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
 
 
 
 
 
 
 
103
  results["total_files"] = len(all_files)
104
 
105
  if not all_files:
106
- logger.warning(f"No PDF or MD files found in {data_dir}")
107
  return results
108
 
109
- logger.info(f"Found {len(all_files)} files to process")
110
-
111
- # Process each file
112
  for file_path in all_files:
113
  try:
114
- await self._process_file(file_path, results)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
  except Exception as e:
116
- error_msg = f"Failed to process {file_path}: {str(e)}"
117
  logger.error(error_msg)
118
  results["errors"].append(error_msg)
119
  results["failed_files"] += 1
120
 
121
- logger.info(f"Knowledge base loading completed: {results}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
  return results
123
 
124
  async def _process_file(self, file_path: Path, results: Dict[str, Any]) -> None:
 
16
  from langchain_text_splitters import RecursiveCharacterTextSplitter
17
  from supabase import create_client, Client
18
  from langchain_core.documents import Document
19
+ from huggingface_hub import snapshot_download
20
  from cache import DocumentCache, document_cache, cache_result
21
 
22
  # Configure logging
 
74
  async def load_knowledge_base(self, data_dir: str = "backend/data") -> Dict[str, Any]:
75
  """
76
  Load and process documents from the data directory.
77
+ If local directory is empty, download from Hugging Face Dataset.
78
 
79
  Args:
80
  data_dir: Path to directory containing documents
81
 
82
  Returns:
83
+ Dictionary with loading statistics
84
  """
 
 
85
  data_path = Path(data_dir)
 
 
 
 
86
  results = {
87
  "total_files": 0,
88
  "processed_files": 0,
 
91
  "errors": []
92
  }
93
 
94
+ # 如果本地資料夾不存在或裡面沒有 PDF/MD 檔案,就從 HF Dataset 下載
95
+ if not data_path.exists() or not any(data_path.glob("*.pdf")) and not any(data_path.glob("*.md")):
96
+ logger.info("Local knowledge base empty or missing. Downloading from Hugging Face Dataset...")
97
+ data_path.mkdir(parents=True, exist_ok=True)
98
+ try:
99
+ snapshot_download(
100
+ repo_id="pcreem/dietinstruction", # ← 這裡一定要正確!
101
+ local_dir=data_dir,
102
+ local_dir_use_symlinks=False,
103
+ repo_type="dataset",
104
+ revision="main",
105
+ allow_patterns=["*.pdf", "*.md", "*.txt"], # 只下載我們需要的檔案
106
+ tqdm_class=None # 避免日誌衝突
107
+ )
108
+ logger.info(f"Successfully downloaded knowledge base to {data_dir}")
109
+ except Exception as e:
110
+ error_msg = f"Failed to download from Hugging Face Dataset: {str(e)}"
111
+ logger.error(error_msg)
112
+ results["errors"].append(error_msg)
113
+ # 如果下載失敗,至少確保資料夾存在
114
+ data_path.mkdir(parents=True, exist_ok=True)
115
+ else:
116
+ logger.info(f"Using existing local knowledge base at {data_dir}")
117
+
118
+ # ===== 以下是原本的檔案載入邏輯(不需改動太多)=====
119
+ documents: List[Document] = []
120
 
121
+ # Supported file types
122
+ pdf_files = list(data_path.glob("*.pdf"))
123
+ md_files = list(data_path.glob("*.md"))
124
+ txt_files = list(data_path.glob("*.txt"))
125
+
126
+ all_files = pdf_files + md_files + txt_files
127
  results["total_files"] = len(all_files)
128
 
129
  if not all_files:
130
+ logger.warning("No documents found in knowledge base directory")
131
  return results
132
 
 
 
 
133
  for file_path in all_files:
134
  try:
135
+ logger.info(f"Processing file: {file_path.name}")
136
+ if file_path.suffix == ".pdf":
137
+ loader = PyPDFLoader(str(file_path))
138
+ elif file_path.suffix == ".md":
139
+ loader = UnstructuredMarkdownLoader(str(file_path))
140
+ elif file_path.suffix == ".txt":
141
+ # Simple text loader
142
+ with open(file_path, "r", encoding="utf-8") as f:
143
+ content = f.read()
144
+ documents.append(Document(
145
+ page_content=content,
146
+ metadata={"file_name": file_path.name, "source": str(file_path)}
147
+ ))
148
+ results["processed_files"] += 1
149
+ continue
150
+ else:
151
+ continue
152
+
153
+ docs = loader.load()
154
+ for doc in docs:
155
+ doc.metadata.update({
156
+ "file_name": file_path.name,
157
+ "source": str(file_path)
158
+ })
159
+ documents.extend(docs)
160
+ results["processed_files"] += 1
161
+
162
  except Exception as e:
163
+ error_msg = f"Error processing {file_path.name}: {str(e)}"
164
  logger.error(error_msg)
165
  results["errors"].append(error_msg)
166
  results["failed_files"] += 1
167
 
168
+ # Split documents into chunks
169
+ if documents:
170
+ chunks = self.text_splitter.split_documents(documents)
171
+ results["total_chunks"] = len(chunks)
172
+ logger.info(f"Created {len(chunks)} document chunks")
173
+
174
+ # Add to vector store (with upsert)
175
+ try:
176
+ self.vector_store.add_documents(chunks)
177
+ logger.info(f"Successfully added {len(chunks)} chunks to vector store")
178
+ except Exception as e:
179
+ error_msg = f"Error adding documents to vector store: {str(e)}"
180
+ logger.error(error_msg)
181
+ results["errors"].append(error_msg)
182
+ else:
183
+ logger.warning("No documents were successfully loaded")
184
+
185
  return results
186
 
187
  async def _process_file(self, file_path: Path, results: Dict[str, Any]) -> None:
requirements.txt CHANGED
@@ -19,6 +19,7 @@ tiktoken
19
  supabase
20
  stripe
21
  httpx
 
22
 
23
  # Utilities
24
  python-dotenv
 
19
  supabase
20
  stripe
21
  httpx
22
+ huggingface_hub
23
 
24
  # Utilities
25
  python-dotenv