# RAG document Q&A utilities (deployed as a Hugging Face Space; the previous
# deploy ended in a runtime error — see the deprecated-API fixes below).
| from langchain_community.document_loaders import PyMuPDFLoader, TextLoader, Docx2txtLoader | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain_openai import OpenAIEmbeddings | |
| from langchain.schema import Document | |
| from langchain_chroma import Chroma | |
| from langchain_community.chat_models import ChatOpenAI | |
| from langchain_core.prompts import ChatPromptTemplate | |
| import os | |
| import tempfile | |
| import shutil | |
| import atexit | |
# Per-process scratch directory holding every persisted Chroma DB.
TEMP_BASE_FOLDER = tempfile.mkdtemp()

def cleanup():
    """Remove the temp vector stores and the uploads folder on exit.

    ``ignore_errors=True`` keeps interpreter shutdown quiet when a
    directory was never created (e.g. nothing was uploaded) or was
    already removed — the original bare ``rmtree("uploads")`` raised
    FileNotFoundError at exit in that case.
    """
    shutil.rmtree(TEMP_BASE_FOLDER, ignore_errors=True)
    shutil.rmtree("uploads", ignore_errors=True)

# Best-effort cleanup when the process exits normally.
atexit.register(cleanup)
# Monotonic counter backing get_unique_filename(); unique per process only.
docs_count = 0

def get_unique_filename(extension=".pdf"):
    """Return a filename that is unique within this process.

    Generalized: *extension* (default ``".pdf"``, preserving the original
    behavior) lets callers name .txt/.docx uploads correctly instead of
    always stamping them ``.pdf``.
    """
    global docs_count
    docs_count += 1
    return f"f_{docs_count}{extension}"
def load_document(file_path):
    """Load *file_path* with the loader matching its extension.

    Supports .pdf, .txt and .docx; any other suffix raises ValueError.
    Returns the list of Documents produced by the matching loader.
    """
    loaders = {
        ".pdf": lambda p: PyMuPDFLoader(file_path=p).load(),
        ".txt": lambda p: TextLoader(p).load(),
        ".docx": lambda p: Docx2txtLoader(p).load(),
    }
    for suffix, load in loaders.items():
        if file_path.endswith(suffix):
            return load(file_path)
    raise ValueError("Unsupported file format")
def split_text(documents: list[Document]):
    """Split *documents* into overlapping chunks for embedding.

    500-char chunks with 150-char overlap; start indices are recorded
    in each chunk's metadata (add_start_index=True).
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=150,
        length_function=len,
        add_start_index=True,
    )
    return splitter.split_documents(documents)
def save_to_chroma(chunks: list[Document], db_name):
    """Embed *chunks* with OpenAI and persist them as a Chroma DB.

    The DB lives under TEMP_BASE_FOLDER/<db_name>; any previous DB of
    the same name is wiped first so re-ingestion starts clean.
    Returns the Chroma instance.
    """
    target_dir = os.path.join(TEMP_BASE_FOLDER, db_name)
    if os.path.exists(target_dir):
        shutil.rmtree(target_dir)
    return Chroma.from_documents(chunks, OpenAIEmbeddings(), persist_directory=target_dir)
def ingest(file_path, db_name):
    """Full ingestion pipeline: load -> split -> embed & persist."""
    save_to_chroma(split_text(load_document(file_path)), db_name)
def search(query, db_path):
    """Return the top-3 (Document, relevance_score) pairs for *query*.

    Guard clause first: when the persisted DB directory does not exist
    we return [] immediately — the original constructed OpenAIEmbeddings
    before the check, doing needless work (and requiring OpenAI
    credentials) on the no-DB path.
    """
    db_dir = os.path.join(TEMP_BASE_FOLDER, db_path)
    if not os.path.exists(db_dir):
        return []
    db = Chroma(persist_directory=db_dir, embedding_function=OpenAIEmbeddings())
    return db.similarity_search_with_relevance_scores(query, k=3)
def extract_page_numbers(results):
    """Map search results to human-readable source tags like "p.3".

    *results* is a list of (Document, score) pairs. Bug fixed: the
    original did ``doc.metadata.get("page", "N/A") + 1``, which raised
    TypeError whenever "page" was missing (e.g. .txt/.docx documents);
    such documents now yield the intended "p.N/A" tag.
    """
    sources_with_pages = []
    for doc, _score in results:
        page = doc.metadata.get("page")
        if isinstance(page, int):
            # Stored page indices are 0-based; display 1-based.
            sources_with_pages.append(f"p.{page + 1}")
        else:
            sources_with_pages.append("p.N/A")
    return sources_with_pages
# Strict-grounding QA prompt: the model must answer only from {context};
# {question} is filled with the (possibly history-rewritten) user query.
PROMPT_TEMPLATE = """
Answer the question based only on the following context:
{context}
- -
Answer the question based on the above context: {question}
"""
def query_rag(query_text, db_name):
    """Answer *query_text* from the named vector DB.

    Returns ``(answer_text, sources)`` where *sources* is a list of
    "p.N" tags. Falls back to a fixed message when no result clears the
    0.4 relevance threshold. Bug fixed: ``model.predict()`` was
    deprecated and removed from modern LangChain (the likely cause of
    the Space's runtime error); ``invoke()`` returns an AIMessage whose
    ``.content`` is the answer string.
    """
    results = search(query_text, db_name)
    if not results or results[0][1] < 0.4:
        return "No relevant information found.", []
    context_text = "\n\n - -\n\n".join(doc.page_content for doc, _ in results)
    sources_with_pages = extract_page_numbers(results)
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query_text)
    model = ChatOpenAI()
    response_text = model.invoke(prompt).content
    return response_text, sources_with_pages
| from langchain_core.prompts import PromptTemplate | |
# Prompt that rewrites a follow-up question into a standalone query.
# Defect fixed: the original string carried typos sent verbatim to the LLM
# ("Cntext", "shouldnt", "users query") and repeated its instructions twice.
CONT_AWARE_QUERY_TEMPLATE = """
You are a RAG prompt generator.
You are given a Chat History and a User Query. Rewrite the User Query into a
Context Aware Query by resolving references (pronouns such as "them", "it")
against the chat history, so it is understandable without the history.
Your response should be as small as possible but must not lose any context.
Keep it simple, short and similar to the user query; remove any stop words.
Example:
Chat History:
User: Who all were the part of this project?
Bot: John Doe and Jane Foster.
User Query: Tell me more about them?
Context Aware Query: about John Doe and Jane Foster
Chat History:
{history}
User Query:
{query}
Context Aware Query:
"""
def context_aware_query(history, query):
    """Rewrite *query* into a standalone query using *history*.

    Feeds CONT_AWARE_QUERY_TEMPLATE to the chat model and returns the
    rewritten query string. Bug fixed: ``model.predict()`` was
    deprecated and removed from modern LangChain; ``invoke()`` returns
    an AIMessage whose ``.content`` holds the text.
    """
    prompt_template = PromptTemplate.from_template(CONT_AWARE_QUERY_TEMPLATE)
    prompt = prompt_template.format(history=history, query=query)
    model = ChatOpenAI()
    return model.invoke(prompt).content