Spaces:
Sleeping
Sleeping
| import os | |
| import glob | |
| from dotenv import load_dotenv | |
| from langchain_community.document_loaders import PyPDFLoader | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from langchain_openai import OpenAIEmbeddings | |
| from langchain_community.vectorstores import FAISS | |
| # Load environment variables | |
| load_dotenv() | |
| BASE_DIR = os.path.dirname(os.path.abspath(__file__)) | |
| DATA_PATH = os.path.join(BASE_DIR, "data") | |
| DB_PATH = os.path.join(BASE_DIR, "vector_db") | |
| def load_documents(): | |
| documents = [] | |
| pdf_files = glob.glob(os.path.join(DATA_PATH, "*.pdf")) | |
| if not pdf_files: | |
| print(f"No PDF files found in {DATA_PATH}") | |
| return [] | |
| print(f"Found {len(pdf_files)} PDF files.") | |
| for pdf_file in pdf_files: | |
| print(f"Loading {pdf_file}...") | |
| try: | |
| loader = PyPDFLoader(pdf_file) | |
| docs = loader.load() | |
| documents.extend(docs) | |
| except Exception as e: | |
| print(f"Error loading {pdf_file}: {e}") | |
| return documents | |
| def split_documents(documents): | |
| text_splitter = RecursiveCharacterTextSplitter( | |
| chunk_size=1000, | |
| chunk_overlap=200, | |
| length_function=len, | |
| add_start_index=True, | |
| ) | |
| chunks = text_splitter.split_documents(documents) | |
| print(f"Split {len(documents)} documents into {len(chunks)} chunks.") | |
| return chunks | |
| def save_to_faiss(chunks): | |
| embeddings = OpenAIEmbeddings() | |
| print("Creating vector database...") | |
| db = FAISS.from_documents(chunks, embeddings) | |
| db.save_local(DB_PATH) | |
| print(f"Saved {len(chunks)} chunks to {DB_PATH}.") | |
| def main(): | |
| documents = load_documents() | |
| if not documents: | |
| return | |
| chunks = split_documents(documents) | |
| save_to_faiss(chunks) | |
| if __name__ == "__main__": | |
| main() | |