# NOTE: "Spaces: Sleeping" below is a Hugging Face Spaces status banner that
# leaked into this file during extraction — it is not part of the program.
# Spaces: Sleeping
| from langchain.document_loaders import PyPDFLoader, PDFMinerLoader, DirectoryLoader | |
| from langchain.embeddings import SentenceTransformerEmbeddings | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain.vectorstores import Chroma | |
| from os.path import join | |
| import os | |
| from dotenv import load_dotenv | |
| # load_dotenv(r'C:\Users\sksha\Desktop\llm-assignment-master\llm-assignment-master\llm-assignment-master_\backend\.env') | |
| openai_api_key = os.environ.get('OPENAI_API_KEY') | |
| from langchain.document_loaders import TextLoader, PDFMinerLoader, UnstructuredWordDocumentLoader, CSVLoader | |
| from langchain_community.document_loaders import PyMuPDFLoader,TextLoader,CSVLoader,Docx2txtLoader,UnstructuredWordDocumentLoader | |
| # def load_documents(file_path): | |
| # if file_path.endswith('.txt'): | |
| # loader = TextLoader(file_path) | |
| # elif file_path.endswith('.pdf'): | |
| # loader = PyPDFLoader(file_path) | |
| # elif file_path.endswith('.doc') or file_path.endswith('.docx'): | |
| # loader = UnstructuredWordDocumentLoader(file_path) | |
| # elif file_path.endswith('.csv'): | |
| # loader = CSVLoader(file_path) | |
| # else: | |
| # raise ValueError(f"Unsupported file format: {file_path}") | |
| # documents = loader.load() | |
| # return documents | |
| from fastapi import UploadFile | |
| from typing import List | |
| import fitz # PyMuPDF | |
| import pandas as pd | |
| import docx | |
| import tempfile | |
| from langchain.docstore.document import Document | |
def read_pdf(file_path: str) -> List[Document]:
    """Load a PDF from *file_path* with PyMuPDF.

    Returns the loader's ``List[Document]`` (one Document per page); the
    previous ``-> str`` annotation was wrong — ``loader.load()`` never
    returns a plain string.
    """
    loader = PyMuPDFLoader(file_path)
    return loader.load()
def read_docx(file_path: str) -> List[Document]:
    """Load a Word document from *file_path* via Unstructured.

    Returns the loader's ``List[Document]``; the previous ``-> str``
    annotation did not match what ``loader.load()`` produces.
    """
    loader = UnstructuredWordDocumentLoader(file_path)
    return loader.load()
def read_csv(file_path: str) -> List[Document]:
    """Load a CSV from *file_path* (one Document per row, per CSVLoader).

    Returns the loader's ``List[Document]``; the previous ``-> str``
    annotation was incorrect.
    """
    loader = CSVLoader(file_path)
    return loader.load()
def read_txt(file_path: str) -> List[Document]:
    """Load a plain-text file from *file_path*.

    Returns the loader's ``List[Document]`` (a single Document holding the
    file's text); the previous ``-> str`` annotation was incorrect.
    """
    loader = TextLoader(file_path)
    return loader.load()
async def load_documents(file: UploadFile) -> List[Document]:
    """Persist an uploaded file to a temp path and load it into Documents.

    Dispatches on the file extension (now case-insensitive, so ``FILE.PDF``
    works) to the matching reader. On success returns the reader's
    ``List[Document]``; on any failure — including an unsupported
    extension — it keeps the legacy best-effort contract and returns the
    sentinel string ``"Error processing document."`` instead of raising.
    NOTE(review): callers must tolerate that str fallback; tightening it to
    a raise would change observable behavior.
    """
    # Extension -> reader dispatch table (clearer than an if/elif chain).
    readers = {
        '.pdf': read_pdf,
        '.docx': read_docx,
        '.csv': read_csv,
        '.txt': read_txt,
    }
    suffix = os.path.splitext(file.filename)[1].lower()
    temp_file_path = None
    try:
        # NamedTemporaryFile gives a unique, non-guessable path; the old
        # f"temp_{file.filename}" scheme collided when two requests uploaded
        # the same filename concurrently (and trusted a client-supplied name).
        # Keep the real suffix so any loader that sniffs it still works.
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
            temp_file_path = temp_file.name
            temp_file.write(await file.read())
        reader = readers.get(suffix)
        if reader is None:
            raise ValueError("Unsupported file format")
        content = reader(temp_file_path)
    except Exception as e:
        # Deliberate broad catch: this endpoint degrades gracefully rather
        # than surfacing loader errors to the client.
        print(f"Error processing document: {e}")
        content = "Error processing document."
    finally:
        # Always remove the temp file, even when loading failed.
        if temp_file_path and os.path.exists(temp_file_path):
            os.remove(temp_file_path)
    return content
| from langchain.text_splitter import CharacterTextSplitter | |
def chunk_documents(documents, chunk_size, chunk_overlap):
    """Split *documents* into chunks of ~*chunk_size* characters.

    Adjacent chunks share *chunk_overlap* characters of context, which helps
    retrieval quality at chunk boundaries.
    """
    splitter = CharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(documents)
| from langchain.embeddings import OpenAIEmbeddings | |
| from langchain.vectorstores import Chroma | |
def create_embeddings(chunked_docs, collection_name):
    """Embed *chunked_docs* with OpenAI and store them in a Chroma collection.

    Uses the module-level ``openai_api_key`` (read from the environment at
    import time). Persists the collection to disk before returning the store.
    """
    embedder = OpenAIEmbeddings(openai_api_key=openai_api_key)
    store = Chroma.from_documents(
        chunked_docs,
        embedder,
        collection_name=collection_name,
    )
    # Flush the collection to disk so it survives process restarts.
    store.persist()
    return store