"""Retrieval-augmented generation helpers.

Documents are loaded, split into chunks and embedded into per-name Chroma
stores kept under a process-wide temporary directory. Queries are answered
by retrieving the closest chunks and passing them to an OpenAI chat model.
"""

from langchain_community.document_loaders import PyMuPDFLoader, TextLoader, Docx2txtLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain.schema import Document
from langchain_chroma import Chroma
from langchain_community.chat_models import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts import PromptTemplate
import os
import tempfile
import shutil
import atexit

# Root directory holding every Chroma store created by this process.
TEMP_BASE_FOLDER = tempfile.mkdtemp()

# Number of chunks retrieved per query.
_TOP_K = 3

# The best hit must reach this relevance score for a query to be answered.
_MIN_RELEVANCE_SCORE = 0.4


def cleanup():
    """Remove the temporary Chroma root and the ``uploads`` directory.

    Registered with ``atexit``. Removal is best-effort: a directory that is
    missing or cannot be deleted must not prevent the other one from being
    cleaned up, nor produce a traceback at interpreter shutdown.
    """
    shutil.rmtree(TEMP_BASE_FOLDER, ignore_errors=True)
    shutil.rmtree("uploads", ignore_errors=True)


atexit.register(cleanup)

# Counter backing get_unique_filename(); incremented on every call.
docs_count = 0


def get_unique_filename():
    """Return the next name in the sequence ``f_1.pdf``, ``f_2.pdf``, ...

    Uniqueness holds only within the current process, since the counter is
    a module-level global that starts at zero on import.
    """
    global docs_count
    docs_count += 1
    return f"f_{docs_count}.pdf"


def load_document(file_path):
    """Load a ``.pdf``, ``.txt`` or ``.docx`` file into LangChain documents.

    The extension is matched case-insensitively, so ``REPORT.PDF`` is
    accepted as well as ``report.pdf``.

    Args:
        file_path: Path of the file to load.

    Returns:
        Whatever the matching loader's ``load()`` returns.

    Raises:
        ValueError: If the extension is not one of the supported formats.
    """
    lowered = file_path.lower()
    if lowered.endswith(".pdf"):
        return PyMuPDFLoader(file_path=file_path).load()
    if lowered.endswith(".txt"):
        return TextLoader(file_path).load()
    if lowered.endswith(".docx"):
        return Docx2txtLoader(file_path).load()
    raise ValueError("Unsupported file format")


def split_text(documents: list[Document]):
    """Split documents into overlapping chunks.

    Uses 500-character chunks with a 150-character overlap and records each
    chunk's start index in its metadata (``add_start_index=True``).

    Args:
        documents: Documents to split.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=150,
        length_function=len,
        add_start_index=True,
    )
    return text_splitter.split_documents(documents)


def save_to_chroma(chunks: list[Document], db_name):
    """Embed ``chunks`` into a fresh Chroma store named ``db_name``.

    Any existing store of the same name under ``TEMP_BASE_FOLDER`` is
    deleted first, so the result contains only ``chunks``.

    Args:
        chunks: Documents to embed and store.
        db_name: Directory name of the store, relative to the temp root.

    Returns:
        The newly created ``Chroma`` instance.
    """
    chroma_path = os.path.join(TEMP_BASE_FOLDER, db_name)
    if os.path.exists(chroma_path):
        shutil.rmtree(chroma_path)
    return Chroma.from_documents(
        chunks, OpenAIEmbeddings(), persist_directory=chroma_path
    )


def ingest(file_path, db_name):
    """Load ``file_path``, split it and store the chunks under ``db_name``.

    Raises:
        ValueError: If the file format is unsupported (from load_document).
    """
    documents = load_document(file_path)
    chunks = split_text(documents)
    save_to_chroma(chunks, db_name)


def search(query, db_path):
    """Return the closest chunks to ``query`` from the store ``db_path``.

    Args:
        query: Text to search for.
        db_path: Store directory name, relative to ``TEMP_BASE_FOLDER``.

    Returns:
        Up to three ``(document, relevance_score)`` pairs, or an empty list
        when the store directory does not exist.
    """
    db_dir = os.path.join(TEMP_BASE_FOLDER, db_path)
    # Check for the store before building the embedding client, so a missing
    # store costs nothing and cannot fail on client construction.
    if not os.path.exists(db_dir):
        return []
    db = Chroma(persist_directory=db_dir, embedding_function=OpenAIEmbeddings())
    return db.similarity_search_with_relevance_scores(query, k=_TOP_K)


def extract_page_numbers(results):
    """Build ``p.<n>`` labels for the documents in ``results``.

    The stored ``page`` metadata is treated as zero-based and shifted by one
    for display. Documents without an integer ``page`` entry are labelled
    ``p.N/A`` instead of raising ``TypeError`` on the arithmetic.

    Args:
        results: Iterable of ``(document, score)`` pairs.

    Returns:
        One label per result, in the same order.
    """
    sources_with_pages = []
    for doc, _ in results:
        page_number = doc.metadata.get("page", "N/A")
        if isinstance(page_number, int):
            sources_with_pages.append(f"p.{page_number + 1}")
        else:
            sources_with_pages.append(f"p.{page_number}")
    return sources_with_pages


PROMPT_TEMPLATE = """
Answer the question based only on the following context:
{context}
- -
Answer the question based on the above context: {question}
"""


def query_rag(query_text, db_name):
    """Answer ``query_text`` using chunks retrieved from ``db_name``.

    Args:
        query_text: The question to answer.
        db_name: Store directory name, relative to ``TEMP_BASE_FOLDER``.

    Returns:
        A ``(answer, sources)`` tuple. ``sources`` holds one page label per
        retrieved chunk. When nothing is retrieved, or the first result
        scores below the relevance threshold, the answer is the fixed
        string ``"No relevant information found."`` and ``sources`` is empty.
    """
    results = search(query_text, db_name)
    # NOTE(review): only the first result is checked, which assumes the
    # store returns hits ordered best-first - confirm against Chroma.
    if not results or results[0][1] < _MIN_RELEVANCE_SCORE:
        return "No relevant information found.", []

    context_text = "\n\n - -\n\n".join(doc.page_content for doc, _ in results)
    sources_with_pages = extract_page_numbers(results)

    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query_text)

    model = ChatOpenAI()
    # invoke() replaces the deprecated predict(); the reply text is .content.
    response_text = model.invoke(prompt).content
    return response_text, sources_with_pages


CONT_AWARE_QUERY_TEMPLATE = """
You are an RAG prompt generator. Read the chat history and users query and modify the user query to include relevant context from the chat history. Your response should be as small as possible but shouldnt have any missing context.

You are a RAG Prompt Generator. You are given a Chat History and a User Query. Your task is to convert User Query into Cntext Aware Query by filling out references from previous history. This Context Aware Query should be understandable without chat history. Keep it simple, short and similar to user query, remove any stopping word.

Example:
Chat History:
User: Who all were the part of this project?
Bot: John Doe and Jane Foster.
User Query: Tell me more about them?
Context Aware Query: about John Doe and Jane Foster

Chat History:
{history}
User Query: {query}
Context Aware Query:
"""


def context_aware_query(history, query):
    """Rewrite ``query`` so it can be understood without ``history``.

    Fills ``CONT_AWARE_QUERY_TEMPLATE`` with the chat history and the user
    query and asks the chat model for the rewritten query.

    Args:
        history: Prior conversation, inserted verbatim into the prompt.
        query: The user's latest query.

    Returns:
        The model's reply text.
    """
    prompt_template = PromptTemplate.from_template(CONT_AWARE_QUERY_TEMPLATE)
    prompt = prompt_template.format(history=history, query=query)
    model = ChatOpenAI()
    # invoke() replaces the deprecated predict(); the reply text is .content.
    cont_awar_query = model.invoke(prompt).content
    return cont_awar_query