root committed
Commit 9082fb5 · 1 Parent(s): 138ca2e

new updates

Files changed (2):
  1. app.py +61 -65
  2. requirements.txt +1 -1
app.py CHANGED
@@ -1,6 +1,14 @@
 import gradio as gr
 import os
 
+
+import string
+import random
+import requests
+from bs4 import BeautifulSoup
+from datetime import datetime
+
+
 from langchain.document_loaders import PyPDFLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.vectorstores import Chroma
@@ -22,8 +30,8 @@ import accelerate
 
 
 # default_persist_directory = './chroma_HF/'
-list_llm = ["mistralai/Mixtral-8x7B-Instruct-v0.1", "mistralai/Mistral-7B-Instruct-v0.2", "mistralai/Mistral-7B-Instruct-v0.1", \
-    "HuggingFaceH4/zephyr-7b-beta", "meta-llama/Llama-2-7b-chat-hf", "microsoft/phi-2", \
+list_llm = ["mistralai/Mistral-7B-Instruct-v0.2", "mistralai/Mistral-7B-Instruct-v0.1", \
+    "HuggingFaceH4/zephyr-7b-beta", "NousResearch/Llama-2-7b-chat-hf", \
     "TinyLlama/TinyLlama-1.1B-Chat-v1.0", "mosaicml/mpt-7b-instruct", "tiiuae/falcon-7b-instruct", \
     "google/flan-t5-xxl"
 ]
@@ -31,20 +39,45 @@ list_llm_simple = [os.path.basename(llm) for llm in list_llm]
 
 # Load PDF document and create doc splits
 def load_doc(list_file_path, chunk_size, chunk_overlap):
-    # Processing for one document only
-    # loader = PyPDFLoader(file_path)
-    # pages = loader.load()
     loaders = [PyPDFLoader(x) for x in list_file_path]
     pages = []
     for loader in loaders:
         pages.extend(loader.load())
-    # text_splitter = RecursiveCharacterTextSplitter(chunk_size = 600, chunk_overlap = 50)
     text_splitter = RecursiveCharacterTextSplitter(
         chunk_size = chunk_size,
         chunk_overlap = chunk_overlap)
     doc_splits = text_splitter.split_documents(pages)
     return doc_splits
 
+def convert_github_url_to_raw(url):
+    # Ensure the URL is a GitHub blob URL
+    if "github.com" in url and "/blob/" in url:
+        raw_url = url.replace("github.com", "raw.githubusercontent.com").replace("/blob", "")
+        response = requests.get(raw_url)
+        html_content = response.text
+        # Step 2: Find the GitHub Icon and Extract the Link
+        soup = BeautifulSoup(html_content, "html.parser")
+        github_icon_link = None
+        for a in soup.find_all('a', href=True):
+            if "github.com" in a['href']:  # Assuming the GitHub link contains "github.com"
+                github_icon_link = a['href']
+                break
+        markdown_url = convert_github_url_to_raw(github_icon_link)
+        response = requests.get(markdown_url)
+        return response
+    else:
+        return ''
+
+def load_url(list_url_path, chunk_size, chunk_overlap):
+    texts = [convert_github_url_to_raw(x) for x in list_url_path]
+    pages = []
+    for text in texts:
+        pages.extend(text)
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size = chunk_size,
+        chunk_overlap = chunk_overlap)
+    doc_splits = text_splitter.split_documents(pages)
+    return doc_splits
 
 # Create vector database
 def create_db(splits, collection_name):
@@ -59,7 +92,6 @@ def create_db(splits, collection_name):
     )
     return vectordb
 
-
 # Load vector database
 def load_db():
     embedding = HuggingFaceEmbeddings()
@@ -68,57 +100,18 @@ def load_db():
         embedding_function=embedding)
     return vectordb
 
-
 # Initialize langchain LLM chain
 def initialize_llmchain(llm_model, temperature, max_tokens, top_k, vector_db, progress=gr.Progress()):
     progress(0.1, desc="Initializing HF tokenizer...")
-    # HuggingFacePipeline uses local model
-    # Note: it will download model locally...
-    # tokenizer=AutoTokenizer.from_pretrained(llm_model)
-    # progress(0.5, desc="Initializing HF pipeline...")
-    # pipeline=transformers.pipeline(
-    #     "text-generation",
-    #     model=llm_model,
-    #     tokenizer=tokenizer,
-    #     torch_dtype=torch.bfloat16,
-    #     trust_remote_code=True,
-    #     device_map="auto",
-    #     # max_length=1024,
-    #     max_new_tokens=max_tokens,
-    #     do_sample=True,
-    #     top_k=top_k,
-    #     num_return_sequences=1,
-    #     eos_token_id=tokenizer.eos_token_id
-    # )
-    # llm = HuggingFacePipeline(pipeline=pipeline, model_kwargs={'temperature': temperature})
-
     # HuggingFaceHub uses HF inference endpoints
     progress(0.5, desc="Initializing HF Hub...")
     # Use of trust_remote_code as model_kwargs
-    # Warning: langchain issue
     # URL: https://github.com/langchain-ai/langchain/issues/6080
-    if llm_model == "mistralai/Mixtral-8x7B-Instruct-v0.1":
-        llm = HuggingFaceHub(
-            repo_id=llm_model,
-            model_kwargs={"temperature": temperature, "max_new_tokens": max_tokens, "top_k": top_k, "load_in_8bit": True}
-        )
-    elif llm_model == "microsoft/phi-2":
-        raise gr.Error("phi-2 model requires 'trust_remote_code=True', currently not supported by langchain HuggingFaceHub...")
-        llm = HuggingFaceHub(
-            repo_id=llm_model,
-            model_kwargs={"temperature": temperature, "max_new_tokens": max_tokens, "top_k": top_k, "trust_remote_code": True, "torch_dtype": "auto"}
-        )
-    elif llm_model == "TinyLlama/TinyLlama-1.1B-Chat-v1.0":
+    if llm_model == "TinyLlama/TinyLlama-1.1B-Chat-v1.0":
         llm = HuggingFaceHub(
            repo_id=llm_model,
            model_kwargs={"temperature": temperature, "max_new_tokens": 250, "top_k": top_k}
        )
-    elif llm_model == "meta-llama/Llama-2-7b-chat-hf":
-        raise gr.Error("Llama-2-7b-chat-hf model requires a Pro subscription...")
-        llm = HuggingFaceHub(
-            repo_id=llm_model,
-            model_kwargs={"temperature": temperature, "max_new_tokens": max_tokens, "top_k": top_k}
-        )
     else:
         llm = HuggingFaceHub(
            repo_id=llm_model,
@@ -148,30 +141,36 @@ def initialize_llmchain(llm_model, temperature, max_tokens, top_k, vector_db, pr
     progress(0.9, desc="Done!")
     return qa_chain
 
-
 # Initialize database
-def initialize_database(list_file_obj, chunk_size, chunk_overlap, progress=gr.Progress()):
+def initialize_database(list_file_obj, input_urls, chunk_size, chunk_overlap, progress=gr.Progress()):
     # Create list of documents (when valid)
     list_file_path = [x.name for x in list_file_obj if x is not None]
+    list_url = [x for x in input_urls if x is not None]
+
     # Create collection_name for vector database
-    progress(0.1, desc="Creating collection name...")
-    collection_name = Path(list_file_path[0]).stem
-    # Fix potential issues from naming convention
-    collection_name = collection_name.replace(" ","-")
-    collection_name = collection_name[:50]
-    # print('list_file_path: ', list_file_path)
+    progress(0.1, desc="Creating collection...")
+    # collection_name = Path(list_file_path[0]).stem
+
+    # # Fix potential issues from naming convention
+    # collection_name = collection_name.replace(" ","-")
+    # collection_name = collection_name[:50]
+    res = ''.join(random.choices(string.ascii_letters, k=10))
+    collection_name = f"HuggingFace101_{res}"
     print('Collection name: ', collection_name)
     progress(0.25, desc="Loading document...")
+
     # Load document and create splits
     doc_splits = load_doc(list_file_path, chunk_size, chunk_overlap)
+    print(type(doc_splits))
+
     # Create or load vector database
     progress(0.5, desc="Generating vector database...")
+
     # global vector_db
     vector_db = create_db(doc_splits, collection_name)
     progress(0.9, desc="Done!")
     return vector_db, collection_name, "Complete!"
 
-
 def initialize_LLM(llm_option, llm_temperature, max_tokens, top_k, vector_db, progress=gr.Progress()):
     # print("llm_option",llm_option)
     llm_name = list_llm[llm_option]
@@ -179,7 +178,6 @@ def initialize_LLM(llm_option, llm_temperature, max_tokens, top_k, vector_db, pr
     qa_chain = initialize_llmchain(llm_name, llm_temperature, max_tokens, top_k, vector_db, progress)
     return qa_chain, "Complete!"
 
-
 def format_chat_history(message, chat_history):
     formatted_chat_history = []
     for user_message, bot_message in chat_history:
@@ -187,7 +185,6 @@ def format_chat_history(message, chat_history):
         formatted_chat_history.append(f"Assistant: {bot_message}")
     return formatted_chat_history
 
-
 def conversation(qa_chain, message, history):
     formatted_chat_history = format_chat_history(message, history)
     #print("formatted_chat_history",formatted_chat_history)
@@ -209,7 +206,6 @@ def conversation(qa_chain, message, history):
     # return gr.update(value=""), new_history, response_sources[0], response_sources[1]
     return qa_chain, gr.update(value=""), new_history, response_source1, response_source1_page, response_source2, response_source2_page
 
-
 def upload_file(file_obj):
     list_file_path = []
     for idx, file in enumerate(file_obj):
@@ -227,16 +223,16 @@ def demo():
     collection_name = gr.State()
 
     gr.Markdown(
-        """<center><h2>PDF-based chatbot (powered by LangChain and open-source LLMs)</center></h2>
-        <h3>Ask any questions about your PDF documents, along with follow-ups</h3>
-        <b>Note:</b> This AI assistant performs retrieval-augmented generation from your PDF documents. \
+        """<center><h2>HugginFace Articles URL-based chatbot (powered by LangChain and open-source LLMs)</center></h2>
+        <h3>Ask any questions about your Huggingface Articles, along with follow-ups</h3>
+        <b>Note:</b> This AI assistant performs retrieval-augmented generation from Huggingface Articles. \
         When generating answers, it takes past questions into account (via conversational memory), and includes document references for clarity purposes.</i>
         <br><b>Warning:</b> This space uses the free CPU Basic hardware from Hugging Face. Some steps and LLM models used below (free inference endpoints) can take some time to generate an output.<br>
         """)
     with gr.Tab("Step 1 - Document pre-processing"):
         with gr.Row():
             document = gr.Files(height=100, file_count="multiple", file_types=["pdf"], interactive=True, label="Upload your PDF documents (single or multiple)")
-            # upload_btn = gr.UploadButton("Loading document...", height=100, file_count="multiple", file_types=["pdf"], scale=1)
+            input_url = gr.Textbox(label="Or enter a URL", placeholder="https://example.com")
         with gr.Row():
             db_btn = gr.Radio(["ChromaDB"], label="Vector database type", value = "ChromaDB", type="index", info="Choose your vector database")
         with gr.Accordion("Advanced options - Document text splitter", open=False):
@@ -247,7 +243,7 @@ def demo():
         with gr.Row():
             db_progress = gr.Textbox(label="Vector database initialization", value="None")
         with gr.Row():
-            db_btn = gr.Button("Generate vector database...")
+            db_btn = gr.Button("Generating vector database...")
 
     with gr.Tab("Step 2 - QA chain initialization"):
         with gr.Row():
@@ -283,7 +279,7 @@ def demo():
     # Preprocessing events
     #upload_btn.upload(upload_file, inputs=[upload_btn], outputs=[document])
     db_btn.click(initialize_database, \
-        inputs=[document, slider_chunk_size, slider_chunk_overlap], \
+        inputs=[document, input_url, slider_chunk_size, slider_chunk_overlap], \
         outputs=[vector_db, collection_name, db_progress])
     qachain_btn.click(initialize_LLM, \
         inputs=[llm_btn, slider_temperature, slider_maxtokens, slider_topk, vector_db], \
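
Review note on the new URL path: as committed, `convert_github_url_to_raw` calls itself recursively on the scraped `github_icon_link` and feeds the result straight back into `requests.get`; `load_url` then extends `pages` with that return value rather than with LangChain `Document` objects, so `split_documents(pages)` cannot succeed; and `initialize_database` builds `list_url` but never calls `load_url`. A minimal sketch of what this path appears to intend, assuming the legacy `langchain.docstore.document.Document` class; the helper names below are illustrative, not part of the commit:

import requests
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

def github_blob_to_raw(url):
    # Map a GitHub "blob" URL to its raw.githubusercontent.com equivalent.
    if "github.com" in url and "/blob/" in url:
        return url.replace("github.com", "raw.githubusercontent.com").replace("/blob", "")
    return None

def load_url(list_url_path, chunk_size, chunk_overlap):
    # Fetch each raw file and wrap it in a Document so split_documents() can consume it.
    pages = []
    for url in list_url_path:
        raw_url = github_blob_to_raw(url)
        if raw_url is None:
            continue
        response = requests.get(raw_url)
        response.raise_for_status()
        pages.append(Document(page_content=response.text, metadata={"source": url}))
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap)
    return text_splitter.split_documents(pages)

Separately, `input_url` is a single `gr.Textbox`, so `initialize_database` receives one string; iterating it in the `list_url` comprehension yields individual characters, not URLs. The value would need to be split (e.g., on newlines) before filtering, and the resulting splits merged into `doc_splits` alongside the PDF splits before `create_db` is called.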
requirements.txt CHANGED
@@ -6,4 +6,4 @@ tqdm
 accelerate
 pypdf
 chromadb
-
+bs4
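
Review note on the dependency: the `bs4` package on PyPI is a thin shim that exists only to install `beautifulsoup4` (which provides `from bs4 import BeautifulSoup`), so this works, but listing `beautifulsoup4` directly is the more conventional requirement.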