Update app.py
Browse files
app.py
CHANGED
|
@@ -20,6 +20,7 @@ from langchain_core.runnables import RunnableParallel, RunnablePassthrough
|
|
| 20 |
from langchain_core.documents import Document
|
| 21 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 22 |
from sklearn.metrics.pairwise import cosine_similarity
|
|
|
|
| 23 |
|
| 24 |
huggingface_token = os.environ.get("HUGGINGFACE_TOKEN")
|
| 25 |
|
|
@@ -142,7 +143,7 @@ _useragent_list = [
|
|
| 142 |
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36",
|
| 143 |
]
|
| 144 |
|
| 145 |
-
def google_search(term, num_results=
|
| 146 |
escaped_term = urllib.parse.quote_plus(term)
|
| 147 |
start = 0
|
| 148 |
all_results = []
|
|
@@ -221,6 +222,30 @@ def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_
|
|
| 221 |
|
| 222 |
return all_results
|
| 223 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 224 |
def ask_question(question, temperature, top_p, repetition_penalty, web_search):
|
| 225 |
global conversation_history
|
| 226 |
|
|
@@ -235,19 +260,19 @@ def ask_question(question, temperature, top_p, repetition_penalty, web_search):
|
|
| 235 |
database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
|
| 236 |
else:
|
| 237 |
database = None
|
| 238 |
-
|
| 239 |
if web_search:
|
| 240 |
search_results = google_search(question)
|
| 241 |
-
|
| 242 |
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
database.add_documents(web_docs)
|
| 247 |
|
| 248 |
-
|
| 249 |
|
| 250 |
-
context_str = "\n".join([
|
|
|
|
| 251 |
|
| 252 |
prompt_template = """
|
| 253 |
Answer the question based on the following web search results:
|
|
@@ -325,6 +350,48 @@ def update_vectors(files, use_recursive_splitter):
|
|
| 325 |
|
| 326 |
return f"Vector store updated successfully. Processed {total_chunks} chunks from {len(files)} files."
|
| 327 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 328 |
def extract_db_to_excel():
|
| 329 |
embed = get_embeddings()
|
| 330 |
database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
|
|
@@ -384,6 +451,10 @@ with gr.Blocks() as demo:
|
|
| 384 |
|
| 385 |
submit_button.click(chat, inputs=[question_input, chatbot, temperature_slider, top_p_slider, repetition_penalty_slider, web_search_checkbox], outputs=[question_input, chatbot])
|
| 386 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 387 |
extract_button = gr.Button("Extract Database to Excel")
|
| 388 |
excel_output = gr.File(label="Download Excel File")
|
| 389 |
extract_button.click(extract_db_to_excel, inputs=[], outputs=excel_output)
|
|
|
|
| 20 |
from langchain_core.documents import Document
|
| 21 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 22 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 23 |
+
from datetime import datetime
|
| 24 |
|
| 25 |
huggingface_token = os.environ.get("HUGGINGFACE_TOKEN")
|
| 26 |
|
|
|
|
| 143 |
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36",
|
| 144 |
]
|
| 145 |
|
| 146 |
+
def google_search(term, num_results=20, lang="en", timeout=5, safe="active", ssl_verify=None):
|
| 147 |
escaped_term = urllib.parse.quote_plus(term)
|
| 148 |
start = 0
|
| 149 |
all_results = []
|
|
|
|
| 222 |
|
| 223 |
return all_results
|
| 224 |
|
| 225 |
+
def summarize_content(content, model):
    """Return a concise LLM-generated summary of *content*.

    Delegates to ``generate_chunked_response`` with a 200-token budget.

    Args:
        content: raw text (e.g. a fetched web page) to condense.
        model: language-model handle accepted by ``generate_chunked_response``.

    Returns:
        The model's summary text.
    """
    prompt = f"""
    Summarize the following content in a concise manner:
    {content}
    Summary:
    """
    return generate_chunked_response(model, prompt, max_tokens=200)
|
| 233 |
+
|
| 234 |
+
def rank_search_results(titles, summaries, model):
    """Rank search results by relevance via the language model.

    Args:
        titles: result titles, parallel to *summaries*.
        summaries: per-result summaries, parallel to *titles*.
        model: language-model handle accepted by ``generate_chunked_response``.

    Returns:
        list[float]: one rank per result (1 == most relevant), parsed from
        the model's comma-separated reply. Non-numeric fragments in the
        reply are skipped rather than raising, so the list may be shorter
        than ``len(titles)`` if the model misbehaves.
    """
    # Build the listing outside the f-string: a backslash ("\n") inside an
    # f-string replacement field is a SyntaxError before Python 3.12.
    entries = ', '.join(
        f"{i + 1}. Title: {title}\nSummary: {summary}"
        for i, (title, summary) in enumerate(zip(titles, summaries))
    )
    ranking_prompt = f"""
    Rank the following search results from a financial analyst perspective.
    Assign a rank from 1 to {len(titles)} based on relevance, with 1 being the most relevant.
    Return only the numeric ranks in order, separated by commas.

    Titles and summaries:
    {entries}

    Ranks:
    """
    ranks_str = generate_chunked_response(model, ranking_prompt)

    ranks = []
    for part in ranks_str.split(','):
        token = part.strip()
        try:
            ranks.append(float(token))
        except ValueError:
            # LLM output is free-form; ignore stray non-numeric fragments
            # instead of aborting the whole ranking with a ValueError.
            continue
    return ranks
|
| 248 |
+
|
| 249 |
def ask_question(question, temperature, top_p, repetition_penalty, web_search):
|
| 250 |
global conversation_history
|
| 251 |
|
|
|
|
| 260 |
database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
|
| 261 |
else:
|
| 262 |
database = None
|
| 263 |
+
|
| 264 |
if web_search:
|
| 265 |
search_results = google_search(question)
|
| 266 |
+
model = get_model(temperature, top_p, repetition_penalty)
|
| 267 |
|
| 268 |
+
summaries = [summarize_content(result["text"], model) for result in search_results]
|
| 269 |
+
titles = [result["title"] for result in search_results]
|
| 270 |
+
ranks = rank_search_results(titles, summaries, model)
|
|
|
|
| 271 |
|
| 272 |
+
update_vector_db_with_search_results(search_results, summaries, ranks)
|
| 273 |
|
| 274 |
+
context_str = "\n".join([f"Title: {result['title']}\nSummary: {summary}\nRank: {rank}"
|
| 275 |
+
for result, summary, rank in zip(search_results, summaries, ranks)])
|
| 276 |
|
| 277 |
prompt_template = """
|
| 278 |
Answer the question based on the following web search results:
|
|
|
|
| 350 |
|
| 351 |
return f"Vector store updated successfully. Processed {total_chunks} chunks from {len(files)} files."
|
| 352 |
|
| 353 |
+
def update_vector_db_with_search_results(search_results, summaries, ranks):
    """Persist summarized web-search results into the FAISS vector store.

    Each result becomes one ``Document`` whose page_content is its summary
    and whose metadata records the search date, title, raw text, summary and
    rank. The store is saved back to the ``faiss_database`` directory.

    Args:
        search_results: dicts with at least ``"title"`` and ``"text"`` keys.
        summaries: LLM summaries, parallel to *search_results*.
        ranks: relevance ranks, parallel to *search_results*.
    """
    current_date = datetime.now().strftime("%Y-%m-%d")

    docs = [
        Document(
            page_content=summary,
            metadata={
                "search_date": current_date,
                "search_title": result["title"],
                "search_content": result["text"],
                "search_summary": summary,
                "rank": rank,
            },
        )
        for result, summary, rank in zip(search_results, summaries, ranks)
    ]
    if not docs:
        # Nothing to index — and FAISS cannot be built from an empty list.
        return

    embed = get_embeddings()
    if os.path.exists("faiss_database"):
        database = FAISS.load_local(
            "faiss_database", embed, allow_dangerous_deserialization=True
        )
        # One batched call instead of one add_documents per document.
        database.add_documents(docs)
    else:
        # The original called FAISS.from_documents([], embed), which raises
        # because an empty corpus has no embedding dimension; seed the new
        # index with the real documents instead.
        database = FAISS.from_documents(docs, embed)

    database.save_local("faiss_database")
|
| 373 |
+
|
| 374 |
+
def export_vector_db_to_excel():
    """Dump every document in the FAISS store to a temporary .xlsx file.

    Returns:
        str: path of the written spreadsheet (caller is responsible for
        cleanup, since the temp file is created with ``delete=False``).

    Notes:
        Metadata is read with ``.get()`` so documents ingested through the
        file-upload path — which lack the ``search_*`` keys but share the
        same store — export as blank cells instead of raising KeyError.
    """
    embed = get_embeddings()
    database = FAISS.load_local(
        "faiss_database", embed, allow_dangerous_deserialization=True
    )

    # NOTE(review): reaches into the private docstore dict; there is no
    # public "iterate all documents" API on this store.
    documents = database.docstore._dict.values()
    data = [
        {
            "Search Date": doc.metadata.get("search_date", ""),
            "Search Title": doc.metadata.get("search_title", ""),
            "Search Content": doc.metadata.get("search_content", ""),
            "Search Summary": doc.metadata.get("search_summary", ""),
            "Rank": doc.metadata.get("rank", ""),
        }
        for doc in documents
    ]

    df = pd.DataFrame(data)

    # Reserve the temp path first, then write after the handle is closed so
    # to_excel does not race the still-open handle (matters on Windows).
    with NamedTemporaryFile(delete=False, suffix='.xlsx') as tmp:
        excel_path = tmp.name
    df.to_excel(excel_path, index=False)

    return excel_path
|
| 394 |
+
|
| 395 |
def extract_db_to_excel():
|
| 396 |
embed = get_embeddings()
|
| 397 |
database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
|
|
|
|
| 451 |
|
| 452 |
submit_button.click(chat, inputs=[question_input, chatbot, temperature_slider, top_p_slider, repetition_penalty_slider, web_search_checkbox], outputs=[question_input, chatbot])
|
| 453 |
|
| 454 |
+
export_vector_db_button = gr.Button("Export Vector DB to Excel")
|
| 455 |
+
vector_db_excel_output = gr.File(label="Download Vector DB Excel File")
|
| 456 |
+
export_vector_db_button.click(export_vector_db_to_excel, inputs=[], outputs=vector_db_excel_output)
|
| 457 |
+
|
| 458 |
extract_button = gr.Button("Extract Database to Excel")
|
| 459 |
excel_output = gr.File(label="Download Excel File")
|
| 460 |
extract_button.click(extract_db_to_excel, inputs=[], outputs=excel_output)
|