Wenye He committed on
Commit
78522bd
·
verified ·
1 Parent(s): 94e1454

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +48 -36
app.py CHANGED
@@ -10,18 +10,30 @@ from langchain_community.vectorstores import FAISS
10
 
11
  # Document processing function
12
  def process_documents(files):
 
13
  documents = []
14
- for file in files:
15
- if file.name.endswith(".pdf"):
16
- loader = PyPDFLoader(file.name)
17
- elif file.name.endswith(".txt"):
18
- loader = TextLoader(file.name)
 
 
19
  documents.extend(loader.load())
20
 
21
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)
 
 
 
 
22
  texts = text_splitter.split_documents(documents)
23
 
24
- embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5")
 
 
 
 
 
25
  vectorstore = FAISS.from_documents(texts, embeddings)
26
  return vectorstore
27
 
@@ -50,37 +62,31 @@ class ChatModel:
50
  def __init__(self):
51
  self.models = {}
52
  self.tokenizers = {}
53
-
54
- def load_model(self, model_name):
55
- if model_name not in self.models:
56
- config = MODEL_CONFIG[model_name]
57
-
58
- tokenizer = AutoTokenizer.from_pretrained(config["model_name"])
59
- tokenizer.pad_token = tokenizer.eos_token
60
-
61
- model = AutoModelForCausalLM.from_pretrained(
62
- config["model_name"],
63
- quantization_config=bnb_config,
64
- device_map="auto",
65
- torch_dtype=torch.float16,
66
- )
67
-
68
- self.models[model_name] = model
69
- self.tokenizers[model_name] = tokenizer
70
 
71
- def generate(self, message, model_name, history, vectorstore=None):
72
- # RAG context retrieval
73
- if vectorstore:
74
- docs = vectorstore.similarity_search(message, k=3)
75
- context = "\n".join([d.page_content for d in docs])
76
- message = f"Context: {context}\n\nQuestion: {message}"
77
 
78
- start_time = time.time() # Start timing
 
 
 
 
 
 
 
 
 
79
  self.load_model(model_name)
80
  config = MODEL_CONFIG[model_name]
81
 
82
- # Format prompt
83
- prompt = config["template"].format(message=message)
 
 
84
 
85
  # Create pipeline
86
  pipe = pipeline(
@@ -119,11 +125,11 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
119
 
120
  # Add document upload section
121
  with gr.Row():
122
- file_output = gr.File(
123
- label="Upload Documents",
124
  file_count="multiple",
125
  file_types=[".pdf", ".txt"],
126
- type="filepath" # Explicitly specify type from [5]
127
  )
128
  with gr.Row():
129
  model_choice = gr.Dropdown(
@@ -140,4 +146,10 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
140
  msg.submit(chat, [msg, chatbot, model_choice], chatbot)
141
  submit_btn.click(chat, [msg, chatbot, model_choice], chatbot)
142
 
 
 
 
 
 
 
143
  demo.launch()
 
10
 
11
  # Document processing function
12
  def process_documents(files):
13
+ """Process PDF/TXT files into vector embeddings"""
14
  documents = []
15
+ for file_path in files:
16
+ if file_path.endswith(".pdf"):
17
+ loader = PyPDFLoader(file_path)
18
+ elif file_path.endswith(".txt"):
19
+ loader = TextLoader(file_path)
20
+ else:
21
+ continue
22
  documents.extend(loader.load())
23
 
24
+ # Split documents into chunks
25
+ text_splitter = RecursiveCharacterTextSplitter(
26
+ chunk_size=512,
27
+ chunk_overlap=50
28
+ )
29
  texts = text_splitter.split_documents(documents)
30
 
31
+ # Create embeddings
32
+ embeddings = HuggingFaceEmbeddings(
33
+ model_name="BAAI/bge-small-en-v1.5"
34
+ )
35
+
36
+ # Create vector store
37
  vectorstore = FAISS.from_documents(texts, embeddings)
38
  return vectorstore
39
 
 
62
  def __init__(self):
63
  self.models = {}
64
  self.tokenizers = {}
65
+ self.vectorstore = None # Add vectorstore reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
 
67
+ # Add this new method
68
+ def update_vectorstore(self, files):
69
+ """Process uploaded files and update vectorstore"""
70
+ if files:
71
+ self.vectorstore = process_documents(files)
 
72
 
73
+ # Modify existing generate method
74
+ def generate(self, message, model_name, history):
75
+ start_time = time.time()
76
+
77
+ # Retrieve relevant context
78
+ context = ""
79
+ if self.vectorstore:
80
+ docs = self.vectorstore.similarity_search(message, k=3)
81
+ context = "\n".join([d.page_content for d in docs])
82
+
83
  self.load_model(model_name)
84
  config = MODEL_CONFIG[model_name]
85
 
86
+ # Update prompt with context
87
+ prompt = config["template"].format(
88
+ message=f"Context: {context}\n\nQuestion: {message}"
89
+ )
90
 
91
  # Create pipeline
92
  pipe = pipeline(
 
125
 
126
  # Add document upload section
127
  with gr.Row():
128
+ file_upload = gr.File(
129
+ label="Upload Documents (PDF/TXT)",
130
  file_count="multiple",
131
  file_types=[".pdf", ".txt"],
132
+ type="filepath"
133
  )
134
  with gr.Row():
135
  model_choice = gr.Dropdown(
 
146
  msg.submit(chat, [msg, chatbot, model_choice], chatbot)
147
  submit_btn.click(chat, [msg, chatbot, model_choice], chatbot)
148
 
149
+ file_upload.upload(
150
+ fn=model_handler.update_vectorstore,
151
+ inputs=file_upload,
152
+ outputs=None
153
+ )
154
+
155
  demo.launch()