Update app.py
Browse files
app.py
CHANGED
|
@@ -19,7 +19,6 @@ st.set_page_config(
|
|
| 19 |
layout="wide"
|
| 20 |
)
|
| 21 |
|
| 22 |
-
st.title("دستیارهوشمند ارتش ")
|
| 23 |
|
| 24 |
st.markdown("""
|
| 25 |
<style>
|
|
@@ -463,162 +462,98 @@ st.markdown("""
|
|
| 463 |
</style>
|
| 464 |
""", unsafe_allow_html=True)
|
| 465 |
|
| 466 |
-
import
|
| 467 |
-
import
|
| 468 |
-
import docx
|
| 469 |
import streamlit as st
|
| 470 |
-
import
|
| 471 |
-
from hazm import Normalizer
|
| 472 |
-
from rapidfuzz import fuzz
|
| 473 |
from langchain.schema import SystemMessage, HumanMessage
|
| 474 |
from langchain.chat_models import ChatOpenAI
|
| 475 |
-
|
| 476 |
-
|
| 477 |
-
|
| 478 |
-
|
| 479 |
-
|
| 480 |
-
|
| 481 |
-
|
| 482 |
-
|
| 483 |
-
|
| 484 |
-
|
| 485 |
-
|
| 486 |
-
|
| 487 |
-
|
| 488 |
-
|
| 489 |
-
|
| 490 |
-
|
| 491 |
-
|
| 492 |
-
|
| 493 |
-
|
| 494 |
-
|
| 495 |
-
|
| 496 |
-
|
| 497 |
-
|
| 498 |
-
|
| 499 |
-
|
| 500 |
-
|
| 501 |
-
|
| 502 |
-
|
| 503 |
-
|
| 504 |
-
|
| 505 |
-
|
| 506 |
-
|
| 507 |
-
|
| 508 |
-
|
| 509 |
-
]
|
| 510 |
-
def remove_stop_words(text, stop_words):
|
| 511 |
-
words = text.split()
|
| 512 |
-
return " ".join([word for word in words if word not in stop_words])
|
| 513 |
-
|
| 514 |
-
def extract_keywords_from_text(text, query_words):
|
| 515 |
-
matched_lines = []
|
| 516 |
-
lines = text.split("\n")
|
| 517 |
-
for line in lines:
|
| 518 |
-
if any(query_word in line for query_word in query_words):
|
| 519 |
-
matched_lines.append(line)
|
| 520 |
-
return matched_lines
|
| 521 |
|
| 522 |
def clean_text(text):
|
| 523 |
-
|
| 524 |
-
|
| 525 |
-
def find_closest_lines(query, doc_texts, stop_words, top_n=10):
|
| 526 |
-
cleaned_query = remove_stop_words(query, stop_words)
|
| 527 |
-
query_words = cleaned_query.split()
|
| 528 |
-
all_matched_lines = []
|
| 529 |
-
|
| 530 |
-
for filename, text in doc_texts.items():
|
| 531 |
-
matched_lines = extract_keywords_from_text(text, query_words)
|
| 532 |
-
for line in matched_lines:
|
| 533 |
-
similarity = fuzz.partial_ratio(query, line)
|
| 534 |
-
all_matched_lines.append((line, similarity))
|
| 535 |
-
|
| 536 |
-
all_matched_lines.sort(key=lambda x: x[1], reverse=True)
|
| 537 |
-
return [line for line, _ in all_matched_lines[:top_n]]
|
| 538 |
-
|
| 539 |
-
def remove_stop_words_from_lines(lines, stop_words):
|
| 540 |
-
cleaned_lines = []
|
| 541 |
-
for line in lines:
|
| 542 |
-
words = line.split()
|
| 543 |
-
cleaned_words = [word for word in words if word not in stop_words]
|
| 544 |
-
cleaned_lines.append(" ".join(cleaned_words))
|
| 545 |
-
return cleaned_lines
|
| 546 |
|
| 547 |
-
st.markdown("""
|
| 548 |
-
<style>
|
| 549 |
-
/* تنظیمات برای بالا بردن موقعیت input و ضخیمتر کردن فونت */
|
| 550 |
-
div[data-baseweb="input"] {
|
| 551 |
-
margin-top: 1px !important; /* فاصله از بالا (کم کن یا زیاد کن به دلخواه) */
|
| 552 |
-
font-weight: 800 !important; /* فونت کلفت */
|
| 553 |
-
font-size: 22px !important; /* اندازه فونت بزرگتر */
|
| 554 |
-
font-family: "Vazir", sans-serif !important; /* اگر فونت فارسی دادی */
|
| 555 |
-
direction: rtl !important; /* راست به چپ */
|
| 556 |
-
text-align: right !important; /* متن راست چین */
|
| 557 |
-
}
|
| 558 |
-
</style>
|
| 559 |
-
""", unsafe_allow_html=True)
|
| 560 |
|
| 561 |
-
query = st.
|
|
|
|
|
|
|
|
|
|
| 562 |
|
| 563 |
if query:
|
| 564 |
-
|
| 565 |
thinking = st.empty()
|
| 566 |
-
thinking.markdown(""
|
| 567 |
-
<div style="background-color:#0d4d31;padding:10px;border-radius:10px;">
|
| 568 |
-
⏳ در حال فکر کردن...
|
| 569 |
-
</div>
|
| 570 |
-
""", unsafe_allow_html=True)
|
| 571 |
|
| 572 |
-
|
| 573 |
-
|
|
|
|
|
|
|
| 574 |
|
| 575 |
-
|
| 576 |
prompt = f"""
|
| 577 |
-
|
| 578 |
-
|
| 579 |
-
|
| 580 |
-
|
| 581 |
-
پاسخ باید نهایی، روان، و در حدود 512 تا 2048 کاراکتر باشد.
|
| 582 |
-
مستقیماً پاسخ را بنویس و هیچ توضیحی درباره نحوه رسیدن به پاسخ نده.
|
| 583 |
-
|
| 584 |
-
سوال:
|
| 585 |
{query}
|
| 586 |
-
|
| 587 |
-
|
| 588 |
-
{
|
| 589 |
-
|
| 590 |
پاسخ نهایی:
|
| 591 |
"""
|
|
|
|
| 592 |
response = llm([
|
| 593 |
SystemMessage(
|
| 594 |
-
|
| 595 |
),
|
| 596 |
HumanMessage(content=prompt)
|
| 597 |
])
|
| 598 |
final_answer = clean_text(response.content.strip())
|
| 599 |
-
|
| 600 |
-
|
|
|
|
| 601 |
|
| 602 |
thinking.empty()
|
| 603 |
|
| 604 |
st.session_state.chat_history.append(("🧑", query))
|
| 605 |
st.session_state.chat_history.append(("🤖", final_answer))
|
| 606 |
|
| 607 |
-
|
| 608 |
-
<style>
|
| 609 |
-
@import url('https://cdn.fontcdn.ir/Font/Persian/Vazir/Vazir.css');
|
| 610 |
-
div.chat-message {
|
| 611 |
-
font-family: 'Vazir', sans-serif;
|
| 612 |
-
font-size: 16px;
|
| 613 |
-
color: white;
|
| 614 |
-
background-color: #0d4d31;
|
| 615 |
-
padding: 10px;
|
| 616 |
-
border-radius: 10px;
|
| 617 |
-
margin-bottom: 5px;
|
| 618 |
-
}
|
| 619 |
-
</style>
|
| 620 |
-
""", unsafe_allow_html=True)
|
| 621 |
-
|
| 622 |
st.markdown("---")
|
| 623 |
for sender, message in st.session_state.chat_history:
|
| 624 |
-
st.markdown(f'<div
|
|
|
|
| 19 |
layout="wide"
|
| 20 |
)
|
| 21 |
|
|
|
|
| 22 |
|
| 23 |
st.markdown("""
|
| 24 |
<style>
|
|
|
|
| 462 |
</style>
|
| 463 |
""", unsafe_allow_html=True)
|
| 464 |
|
| 465 |
+
import json
import os

import numpy as np
import requests
import streamlit as st
from langchain.chat_models import ChatOpenAI
from langchain.schema import SystemMessage, HumanMessage
from sklearn.metrics.pairwise import cosine_similarity
|
| 472 |
+
|
| 473 |
+
EMBEDDING_FILE = "embeddings.json"
|
| 474 |
+
EMBEDDING_MODEL = "intfloat/multilingual-e5-large-instruct"
|
| 475 |
+
TOGETHER_API_KEY = "333ac33f5be91819cb7ade101134d73f5e63d299a964ae290850eeac5d82a8d5"
|
| 476 |
+
|
| 477 |
+
@st.cache_data
|
| 478 |
+
def load_embeddings(file_path):
|
| 479 |
+
with open(file_path, "r", encoding="utf-8") as f:
|
| 480 |
+
return json.load(f)
|
| 481 |
+
|
| 482 |
+
def get_query_embedding_together(query):
|
| 483 |
+
url = "https://api.together.xyz/v1/embeddings"
|
| 484 |
+
headers = {
|
| 485 |
+
"Authorization": f"Bearer {TOGETHER_API_KEY}",
|
| 486 |
+
"accept": "application/json",
|
| 487 |
+
"content-type": "application/json"
|
| 488 |
+
}
|
| 489 |
+
payload = {
|
| 490 |
+
"model": EMBEDDING_MODEL,
|
| 491 |
+
"input": query
|
| 492 |
+
}
|
| 493 |
+
response = requests.post(url, headers=headers, json=payload)
|
| 494 |
+
response.raise_for_status()
|
| 495 |
+
return response.json()["data"][0]["embedding"]
|
| 496 |
+
|
| 497 |
+
def find_most_similar_chunks(query_embedding, data, top_n=3):
|
| 498 |
+
query_vec = np.array(query_embedding).reshape(1, -1)
|
| 499 |
+
similarities = []
|
| 500 |
+
for item in data:
|
| 501 |
+
chunk_vec = np.array(item["embedding"]).reshape(1, -1)
|
| 502 |
+
sim = cosine_similarity(query_vec, chunk_vec)[0][0]
|
| 503 |
+
similarities.append((item["chunk"], sim))
|
| 504 |
+
similarities.sort(key=lambda x: x[1], reverse=True)
|
| 505 |
+
return [chunk for chunk, _ in similarities[:top_n]]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 506 |
|
| 507 |
def clean_text(text):
|
| 508 |
+
import re
|
| 509 |
+
return re.sub(r'[^آ-یa-zA-Z0-9۰-۹,.،؟!؛\s]+', '', text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 510 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 511 |
|
| 512 |
+
query = st.text_input("سؤال خود را وارد کنید:")
|
| 513 |
+
|
| 514 |
+
if "chat_history" not in st.session_state:
|
| 515 |
+
st.session_state.chat_history = []
|
| 516 |
|
| 517 |
if query:
|
|
|
|
| 518 |
thinking = st.empty()
|
| 519 |
+
thinking.markdown("⏳ در حال پردازش...")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 520 |
|
| 521 |
+
try:
|
| 522 |
+
query_embedding = get_query_embedding_together(query)
|
| 523 |
+
data = load_embeddings(EMBEDDING_FILE)
|
| 524 |
+
top_chunks = find_most_similar_chunks(query_embedding, data, top_n=3)
|
| 525 |
|
| 526 |
+
context = "\n".join(top_chunks)
|
| 527 |
prompt = f"""
|
| 528 |
+
فقط و فقط با استفاده از محتوای زیر به سؤال پاسخ بده.
|
| 529 |
+
اگر اطلاعات کافی نبود، واضح بگو اطلاعات کافی وجود ندارد، سپس با دانش عمومی پاسخ بده.
|
| 530 |
+
|
| 531 |
+
سؤال:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 532 |
{query}
|
| 533 |
+
|
| 534 |
+
محتوا:
|
| 535 |
+
{context}
|
| 536 |
+
|
| 537 |
پاسخ نهایی:
|
| 538 |
"""
|
| 539 |
+
|
| 540 |
response = llm([
|
| 541 |
SystemMessage(
|
| 542 |
+
content="تو یک دستیار دقیق هستی که فقط با اطلاعات موجود در متن پاسخ میدهی. اگر اطلاعات نبود، آن را اعلام میکنی و بعد از دانش خودت استفاده میکنی."
|
| 543 |
),
|
| 544 |
HumanMessage(content=prompt)
|
| 545 |
])
|
| 546 |
final_answer = clean_text(response.content.strip())
|
| 547 |
+
|
| 548 |
+
except Exception as e:
|
| 549 |
+
final_answer = f"❗ خطا: {str(e)}"
|
| 550 |
|
| 551 |
thinking.empty()
|
| 552 |
|
| 553 |
st.session_state.chat_history.append(("🧑", query))
|
| 554 |
st.session_state.chat_history.append(("🤖", final_answer))
|
| 555 |
|
| 556 |
+
# نمایش چت
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 557 |
st.markdown("---")
|
| 558 |
for sender, message in st.session_state.chat_history:
|
| 559 |
+
st.markdown(f'<div style="direction:rtl;text-align:right;padding:10px;border-radius:10px;background-color:#0d4d31;color:white;"><strong>{sender}</strong>: {message}</div>', unsafe_allow_html=True)
|