Spaces:

InnovisionLLC
/

example_test

Paused

App Files Files Community

Wenye He committed on Feb 18, 2025

Commit

78522bd

verified ·

1 Parent(s): 94e1454

Update app.py

Browse files

Files changed (1) hide show

app.py +48 -36

app.py CHANGED Viewed

@@ -10,18 +10,30 @@ from langchain_community.vectorstores import FAISS
 # Document processing function
 def process_documents(files):
     documents = []
-    for file in files:
-        if file.name.endswith(".pdf"):
-            loader = PyPDFLoader(file.name)
-        elif file.name.endswith(".txt"):
-            loader = TextLoader(file.name)
         documents.extend(loader.load())
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)
     texts = text_splitter.split_documents(documents)
-    embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5")
     vectorstore = FAISS.from_documents(texts, embeddings)
     return vectorstore
@@ -50,37 +62,31 @@ class ChatModel:
     def __init__(self):
         self.models = {}
         self.tokenizers = {}
-    def load_model(self, model_name):
-        if model_name not in self.models:
-            config = MODEL_CONFIG[model_name]
-            tokenizer = AutoTokenizer.from_pretrained(config["model_name"])
-            tokenizer.pad_token = tokenizer.eos_token
-            model = AutoModelForCausalLM.from_pretrained(
-                config["model_name"],
-                quantization_config=bnb_config,
-                device_map="auto",
-                torch_dtype=torch.float16,
-            )
-            self.models[model_name] = model
-            self.tokenizers[model_name] = tokenizer
-    def generate(self, message, model_name, history, vectorstore=None):
-        # RAG context retrieval
-        if vectorstore:
-            docs = vectorstore.similarity_search(message, k=3)
-            context = "\n".join([d.page_content for d in docs])
-            message = f"Context: {context}\n\nQuestion: {message}"
-        start_time = time.time()  # Start timing
         self.load_model(model_name)
         config = MODEL_CONFIG[model_name]
-        # Format prompt
-        prompt = config["template"].format(message=message)
         # Create pipeline
         pipe = pipeline(
@@ -119,11 +125,11 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     # Add document upload section
     with gr.Row():
-        file_output = gr.File(
-            label="Upload Documents",
             file_count="multiple",
             file_types=[".pdf", ".txt"],
-            type="filepath"  # Explicitly specify type from [5]
         )
     with gr.Row():
         model_choice = gr.Dropdown(
@@ -140,4 +146,10 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     msg.submit(chat, [msg, chatbot, model_choice], chatbot)
     submit_btn.click(chat, [msg, chatbot, model_choice], chatbot)
 demo.launch()

 # Document processing function
 def process_documents(files):
+    """Process PDF/TXT files into vector embeddings"""
     documents = []
+    for file_path in files:
+        if file_path.endswith(".pdf"):
+            loader = PyPDFLoader(file_path)
+        elif file_path.endswith(".txt"):
+            loader = TextLoader(file_path)
+        else:
+            continue
         documents.extend(loader.load())
+    # Split documents into chunks
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=512,
+        chunk_overlap=50
+    )
     texts = text_splitter.split_documents(documents)
+    # Create embeddings
+    embeddings = HuggingFaceEmbeddings(
+        model_name="BAAI/bge-small-en-v1.5"
+    )
+    # Create vector store
     vectorstore = FAISS.from_documents(texts, embeddings)
     return vectorstore
     def __init__(self):
         self.models = {}
         self.tokenizers = {}
+        self.vectorstore = None  # Add vectorstore reference
+    # Add this new method
+    def update_vectorstore(self, files):
+        """Process uploaded files and update vectorstore"""
+        if files:
+            self.vectorstore = process_documents(files)
+    # Modify existing generate method
+    def generate(self, message, model_name, history):
+        start_time = time.time()
+        # Retrieve relevant context
+        context = ""
+        if self.vectorstore:
+            docs = self.vectorstore.similarity_search(message, k=3)
+            context = "\n".join([d.page_content for d in docs])
         self.load_model(model_name)
         config = MODEL_CONFIG[model_name]
+        # Update prompt with context
+        prompt = config["template"].format(
+            message=f"Context: {context}\n\nQuestion: {message}"
+        )
         # Create pipeline
         pipe = pipeline(
     # Add document upload section
     with gr.Row():
+        file_upload = gr.File(
+            label="Upload Documents (PDF/TXT)",
             file_count="multiple",
             file_types=[".pdf", ".txt"],
+            type="filepath"
         )
     with gr.Row():
         model_choice = gr.Dropdown(
     msg.submit(chat, [msg, chatbot, model_choice], chatbot)
     submit_btn.click(chat, [msg, chatbot, model_choice], chatbot)
+    file_upload.upload(
+        fn=model_handler.update_vectorstore,
+        inputs=file_upload,
+        outputs=None
+    )
 demo.launch()