# cbio-vec / app.py
# jim-bo's picture
# initial commit
# 56689a3
import gradio as gr
import os
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
# Make sure the directory that holds the PDFs to index exists before the UI
# is built. exist_ok=True turns this into a single idempotent call, so there
# is no window between checking for the directory and creating it.
os.makedirs("pdfs", exist_ok=True)
def get_pdf_files():
    """Return the names of the entries in 'pdfs' that end with '.pdf'.

    Names are returned in the order os.listdir yields them; the match on the
    extension is case-sensitive.
    """
    pdf_names = []
    for entry in os.listdir("pdfs"):
        if entry.endswith(".pdf"):
            pdf_names.append(entry)
    return pdf_names
def index_pdfs():
    """Index every PDF in the 'pdfs' directory into the Chroma store at './db'.

    Each PDF is loaded with PyPDFLoader, split into chunks of at most 1000
    characters with no overlap, embedded with OpenAIEmbeddings and written to
    the Chroma collection persisted under './db'. A failure on one file is
    recorded and does not stop the remaining files from being processed.

    Returns:
        A human-readable status string listing the files that were indexed
        and the files that failed (with the reason), or a message saying
        that no PDF files were found.
    """
    pdf_files = get_pdf_files()
    if not pdf_files:
        return "No PDF files found in the 'pdfs' directory."

    # Neither the splitter nor the embedding client depends on the file being
    # processed, so build them once instead of once per PDF.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    try:
        embeddings = OpenAIEmbeddings()
    except Exception as e:
        # Without an embedding client nothing can be indexed; report the same
        # per-file failure entries the loop below would have produced.
        failures = [f"{pdf_file} (Error: {e})" for pdf_file in pdf_files]
        return f"Failed to index: {', '.join(failures)}"

    success_files = []
    failed_files = []
    for pdf_file in pdf_files:
        file_path = os.path.join("pdfs", pdf_file)
        # Broad catch on purpose: this is the UI boundary, and one unreadable
        # or malformed PDF must not abort the whole indexing run.
        try:
            if os.path.getsize(file_path) == 0:
                failed_files.append(f"{pdf_file} (file is empty)")
                continue

            documents = PyPDFLoader(file_path).load()
            texts = text_splitter.split_documents(documents)
            if not texts:
                # Nothing to embed (presumably a scanned/image-only PDF);
                # say so explicitly rather than handing the vector store an
                # empty batch.
                failed_files.append(f"{pdf_file} (no extractable text)")
                continue

            db = Chroma.from_documents(texts, embeddings, persist_directory="./db")
            # NOTE(review): newer Chroma releases persist automatically and
            # deprecate persist(); confirm against the pinned version.
            db.persist()
        except Exception as e:
            failed_files.append(f"{pdf_file} (Error: {e})")
        else:
            success_files.append(pdf_file)

    status = ""
    if success_files:
        status += f"Successfully indexed: {', '.join(success_files)}\n"
    if failed_files:
        status += f"Failed to index: {', '.join(failed_files)}"
    return status if status else "No files were processed."
def search(query):
    """Search the indexed PDFs for passages similar to *query*.

    Args:
        query: Free-text search string typed by the user.

    Returns:
        A text block with one "Source: ..." / "Content: ..." entry per match,
        or a short message when the query is blank or nothing matches.
    """
    # A blank query has nothing to embed; answer without touching the
    # embedding service or the vector store.
    if not query or not query.strip():
        return "Please enter a search query."

    embeddings = OpenAIEmbeddings()
    db = Chroma(persist_directory="./db", embedding_function=embeddings)
    docs = db.similarity_search(query)
    if not docs:
        return "No results found."

    # A stored document is not guaranteed to carry a 'source' entry in its
    # metadata, so fall back to a placeholder instead of raising KeyError.
    return "".join(
        f"Source: {doc.metadata.get('source', 'unknown')}\n"
        f"Content: {doc.page_content}\n\n"
        for doc in docs
    )
# Build the two-tab Gradio interface: one tab to index the PDFs found in the
# 'pdfs' directory, one tab to run a semantic search over the index.
with gr.Blocks() as demo:
    gr.Markdown("# Simple Semantic Search App")

    with gr.Tab("Index PDFs"):
        # Pass a callable rather than a precomputed string: Gradio calls it
        # each time the app loads, so PDFs added after start-up show up in
        # the list instead of the snapshot taken at import time.
        pdf_files_display = gr.Textbox(
            label="Available PDF Files",
            interactive=False,
            value=lambda: "\n".join(get_pdf_files()),
        )
        index_button = gr.Button("Index PDFs")
        index_status = gr.Textbox(label="Indexing Status", interactive=False)
        index_button.click(index_pdfs, inputs=None, outputs=index_status)

    with gr.Tab("Search"):
        search_query = gr.Textbox(label="Search Query")
        search_button = gr.Button("Search")
        search_results = gr.Textbox(label="Search Results", interactive=False)
        search_button.click(search, inputs=search_query, outputs=search_results)

# Only start the web server when the file is run as a script, not on import.
if __name__ == "__main__":
    demo.launch()