Update app.py
Browse files
app.py
CHANGED
|
@@ -502,11 +502,22 @@ import concurrent.futures
|
|
| 502 |
from hazm import Normalizer
|
| 503 |
from rapidfuzz import fuzz
|
| 504 |
from langchain.schema import SystemMessage, HumanMessage
|
|
|
|
| 505 |
|
|
|
|
| 506 |
folder_path = '46'
|
| 507 |
normalizer = Normalizer()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 508 |
|
| 509 |
-
|
|
|
|
| 510 |
def load_and_process_documents(path):
|
| 511 |
def process_docx(filename):
|
| 512 |
try:
|
|
@@ -516,12 +527,10 @@ def load_and_process_documents(path):
|
|
| 516 |
normalized = normalizer.normalize(text)
|
| 517 |
return filename, normalized
|
| 518 |
except Exception as e:
|
| 519 |
-
print(f"Error processing {filename}: {e}")
|
| 520 |
return filename, ""
|
| 521 |
|
| 522 |
filenames = [f for f in os.listdir(path) if f.endswith(".docx")]
|
| 523 |
doc_texts = {}
|
| 524 |
-
|
| 525 |
with concurrent.futures.ThreadPoolExecutor() as executor:
|
| 526 |
for filename, content in executor.map(process_docx, filenames):
|
| 527 |
doc_texts[filename] = content
|
|
@@ -530,57 +539,43 @@ def load_and_process_documents(path):
|
|
| 530 |
|
| 531 |
doc_texts = load_and_process_documents(folder_path)
|
| 532 |
|
| 533 |
-
#
|
| 534 |
stop_words = [
|
| 535 |
"است", "و", "با", "که", "در", "از", "برای", "به", "بر", "تا", "این", "آن", "یک", "کدام", "کجا", "هم", "همه",
|
| 536 |
-
"یا", "
|
| 537 |
-
"کرد", "کردن", "نیز", "
|
| 538 |
]
|
| 539 |
|
| 540 |
-
#
|
| 541 |
def remove_stop_words(text, stop_words):
    """Return *text* with every stop word and stop phrase removed.

    The text is tokenized on whitespace and the surviving tokens are
    re-joined with single spaces.

    Args:
        text: The input string.
        stop_words: Iterable of stop entries. An entry may be a single word
            or a multi-word phrase; a phrase is removed only when all of its
            words appear consecutively in *text*.

    Returns:
        The filtered text as a single space-separated string.
    """
    # Split the entries once: single words go into a set for O(1) lookups,
    # multi-word phrases are kept as token lists because they can never be
    # equal to one whitespace-delimited token.
    single_words = set()
    phrases = []
    for entry in stop_words:
        parts = entry.split()
        if len(parts) == 1:
            single_words.add(parts[0])
        elif parts:
            phrases.append(parts)
    # Try longer phrases first so a phrase is not shadowed by its own prefix.
    phrases.sort(key=len, reverse=True)

    tokens = text.split()
    kept = []
    index = 0
    while index < len(tokens):
        matched = next(
            (p for p in phrases if tokens[index:index + len(p)] == p),
            None,
        )
        if matched is not None:
            index += len(matched)
            continue
        if tokens[index] not in single_words:
            kept.append(tokens[index])
        index += 1
    return " ".join(kept)
|
| 544 |
|
| 545 |
-
# تابعی برای استخراج کلمات از متن
|
| 546 |
def extract_keywords_from_text(text, query_words):
    """Return the lines of *text* that contain at least one query word.

    Matching is a plain substring test, so a query word also matches when it
    occurs inside a longer word.

    Args:
        text: Document text; lines are separated by "\\n".
        query_words: Iterable of search terms.

    Returns:
        A list of the matching lines, in their original order.
    """
    # An empty term is a substring of every line and would match the whole
    # document, so such terms are ignored.
    terms = [word for word in query_words if word]
    if not terms:
        return []
    return [
        line
        for line in text.split("\n")
        if any(term in line for term in terms)
    ]
|
| 555 |
|
| 556 |
-
# تابعی برای پاکسازی متن
|
| 557 |
def clean_text(text):
    """Strip every character outside the allowed Persian/numeric set.

    Kept characters: Persian letters, Persian and ASCII digits, the
    punctuation marks listed in the pattern, the space, and the zero-width
    non-joiner (U+200C). The non-joiner is part of correct Persian spelling;
    removing it fuses the two halves of compound words.

    Args:
        text: The string to clean.

    Returns:
        *text* with all disallowed characters removed.
    """
    return re.sub(r'[^\u200cآ-ی۰-۹0-9،.؟!؛+\-* ]+', '', text)
|
| 559 |
|
| 560 |
-
# تابعی برای پیدا کردن نزدیکترین خطوط به سوال
|
| 561 |
def find_closest_lines(query, doc_texts, stop_words, top_n=10):
|
| 562 |
-
# حذف کلمات اضافی از سوال
|
| 563 |
cleaned_query = remove_stop_words(query, stop_words)
|
| 564 |
query_words = cleaned_query.split()
|
| 565 |
-
|
| 566 |
all_matched_lines = []
|
| 567 |
-
|
| 568 |
-
# بررسی محتوای فایلها
|
| 569 |
for filename, text in doc_texts.items():
|
| 570 |
matched_lines = extract_keywords_from_text(text, query_words)
|
| 571 |
for line in matched_lines:
|
| 572 |
-
similarity = fuzz.partial_ratio(query, line)
|
| 573 |
all_matched_lines.append((line, similarity))
|
| 574 |
-
|
| 575 |
-
# مرتب سازی بر اساس شباهت
|
| 576 |
-
all_matched_lines.sort(key=lambda x: x[1], reverse=True)
|
| 577 |
|
| 578 |
-
|
| 579 |
-
|
| 580 |
-
|
| 581 |
-
return closest_lines
|
| 582 |
|
| 583 |
-
# تابعی برای حذف کلمات توقف از یک لیست از خطوط
|
| 584 |
def remove_stop_words_from_lines(lines, stop_words):
|
| 585 |
cleaned_lines = []
|
| 586 |
for line in lines:
|
|
@@ -589,25 +584,23 @@ def remove_stop_words_from_lines(lines, stop_words):
|
|
| 589 |
cleaned_lines.append(" ".join(cleaned_words))
|
| 590 |
return cleaned_lines
|
| 591 |
|
| 592 |
-
|
| 593 |
-
st.session_state.chat_history = []
|
| 594 |
-
|
| 595 |
query = st.chat_input("چطور میتونم کمک کنم؟")
|
| 596 |
|
| 597 |
-
|
| 598 |
if query:
|
| 599 |
-
|
|
|
|
| 600 |
|
| 601 |
-
|
| 602 |
-
|
| 603 |
-
|
| 604 |
-
<div
|
| 605 |
-
|
|
|
|
| 606 |
""", unsafe_allow_html=True)
|
| 607 |
-
# پیدا کردن ۱۰ خط نزدیکتر به سوال
|
| 608 |
-
closest_lines = find_closest_lines(query, doc_texts, stop_words, top_n=3)
|
| 609 |
|
| 610 |
-
#
|
|
|
|
| 611 |
cleaned_closest_lines = remove_stop_words_from_lines(closest_lines, stop_words)
|
| 612 |
|
| 613 |
if cleaned_closest_lines:
|
|
@@ -624,18 +617,18 @@ if query:
|
|
| 624 |
SystemMessage(content="You are a helpful assistant."),
|
| 625 |
HumanMessage(content=prompt)
|
| 626 |
])
|
| 627 |
-
|
| 628 |
-
|
| 629 |
-
|
| 630 |
-
st.session_state.chat_history.append(("🧑", query))
|
| 631 |
-
st.session_state.chat_history.append(("🤖", rewritten))
|
| 632 |
|
| 633 |
-
|
|
|
|
| 634 |
|
| 635 |
-
|
| 636 |
-
|
| 637 |
-
|
| 638 |
|
| 639 |
# نمایش تاریخچه گفتگو
|
| 640 |
-
|
| 641 |
-
|
|
|
|
|
|
| 502 |
from hazm import Normalizer
|
| 503 |
from rapidfuzz import fuzz
|
| 504 |
from langchain.schema import SystemMessage, HumanMessage
|
| 505 |
+
from langchain.chat_models import ChatOpenAI
|
| 506 |
|
| 507 |
+
# Settings
|
| 508 |
folder_path = '46'
|
| 509 |
normalizer = Normalizer()
|
| 510 |
+
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo")
|
| 511 |
+
|
| 512 |
+
st.set_page_config(page_title="پرسش از اسناد", layout="wide")
|
| 513 |
+
st.title("📄 دستیار هوشمند پرسش از اسناد")
|
| 514 |
+
|
| 515 |
+
# Conversation memory
|
| 516 |
+
if "chat_history" not in st.session_state:
|
| 517 |
+
st.session_state.chat_history = []
|
| 518 |
|
| 519 |
+
# Load the documents
|
| 520 |
+
@st.cache_data(show_spinner="در حال بارگذاری اسناد...")
|
| 521 |
def load_and_process_documents(path):
|
| 522 |
def process_docx(filename):
|
| 523 |
try:
|
|
|
|
| 527 |
normalized = normalizer.normalize(text)
|
| 528 |
return filename, normalized
|
| 529 |
except Exception as e:
|
|
|
|
| 530 |
return filename, ""
|
| 531 |
|
| 532 |
filenames = [f for f in os.listdir(path) if f.endswith(".docx")]
|
| 533 |
doc_texts = {}
|
|
|
|
| 534 |
with concurrent.futures.ThreadPoolExecutor() as executor:
|
| 535 |
for filename, content in executor.map(process_docx, filenames):
|
| 536 |
doc_texts[filename] = content
|
|
|
|
| 539 |
|
| 540 |
doc_texts = load_and_process_documents(folder_path)
|
| 541 |
|
| 542 |
+
# Stop words
|
| 543 |
stop_words = [
|
| 544 |
"است", "و", "با", "که", "در", "از", "برای", "به", "بر", "تا", "این", "آن", "یک", "کدام", "کجا", "هم", "همه",
|
| 545 |
+
"یا", "همچنین", "می", "باید", "شود", "شد", "گفت", "گویا", "داشت", "داشتن", "کنند", "کنیم",
|
| 546 |
+
"کرد", "کردن", "نیز", "اگر", "ای", "اینکه", "نه", "باشید", "باشم", "باشی", "در حالی که", "مگر", "چرا"
|
| 547 |
]
|
| 548 |
|
| 549 |
+
# Helper functions
|
| 550 |
def remove_stop_words(text, stop_words):
    """Return *text* with every stop word and stop phrase removed.

    The text is tokenized on whitespace and the surviving tokens are
    re-joined with single spaces.

    Args:
        text: The input string.
        stop_words: Iterable of stop entries. An entry may be a single word
            or a multi-word phrase; a phrase is removed only when all of its
            words appear consecutively in *text*.

    Returns:
        The filtered text as a single space-separated string.
    """
    # Split the entries once: single words go into a set for O(1) lookups,
    # multi-word phrases are kept as token lists because they can never be
    # equal to one whitespace-delimited token.
    single_words = set()
    phrases = []
    for entry in stop_words:
        parts = entry.split()
        if len(parts) == 1:
            single_words.add(parts[0])
        elif parts:
            phrases.append(parts)
    # Try longer phrases first so a phrase is not shadowed by its own prefix.
    phrases.sort(key=len, reverse=True)

    tokens = text.split()
    kept = []
    index = 0
    while index < len(tokens):
        matched = next(
            (p for p in phrases if tokens[index:index + len(p)] == p),
            None,
        )
        if matched is not None:
            index += len(matched)
            continue
        if tokens[index] not in single_words:
            kept.append(tokens[index])
        index += 1
    return " ".join(kept)
|
| 553 |
|
|
|
|
| 554 |
def extract_keywords_from_text(text, query_words):
    """Return the lines of *text* that contain at least one query word.

    Matching is a plain substring test, so a query word also matches when it
    occurs inside a longer word.

    Args:
        text: Document text; lines are separated by "\\n".
        query_words: Iterable of search terms.

    Returns:
        A list of the matching lines, in their original order.
    """
    # An empty term is a substring of every line and would match the whole
    # document, so such terms are ignored.
    terms = [word for word in query_words if word]
    if not terms:
        return []
    return [
        line
        for line in text.split("\n")
        if any(term in line for term in terms)
    ]
|
| 561 |
|
|
|
|
| 562 |
def clean_text(text):
    """Strip every character outside the allowed Persian/numeric set.

    Kept characters: Persian letters, Persian and ASCII digits, the
    punctuation marks listed in the pattern, the space, and the zero-width
    non-joiner (U+200C). The non-joiner is part of correct Persian spelling;
    removing it fuses the two halves of compound words.

    Args:
        text: The string to clean.

    Returns:
        *text* with all disallowed characters removed.
    """
    return re.sub(r'[^\u200cآ-ی۰-۹0-9،.؟!؛+\-* ]+', '', text)
|
| 564 |
|
|
|
|
| 565 |
def find_closest_lines(query, doc_texts, stop_words, top_n=10):
    """Return the document lines most similar to *query*.

    Stop words are removed from the query, every line containing one of the
    remaining words is collected from all documents, and each candidate is
    scored against the full original query with ``fuzz.partial_ratio``.

    Args:
        query: The user's question.
        doc_texts: Mapping of filename to document text.
        stop_words: Stop entries to drop from the query before matching.
        top_n: Maximum number of lines to return; ``None`` returns all
            candidates.

    Returns:
        Up to *top_n* distinct lines, highest score first. Lines with equal
        scores keep the order in which they were first encountered.
    """
    query_words = remove_stop_words(query, stop_words).split()
    # Nothing to search for, or nothing requested: skip the scoring pass.
    # A negative top_n would otherwise slice from the wrong end of the list.
    if not query_words or (top_n is not None and top_n <= 0):
        return []

    # Key by line text so a line repeated across documents is scored once
    # and cannot fill several of the top_n slots with the same content.
    scores = {}
    for text in doc_texts.values():
        for line in extract_keywords_from_text(text, query_words):
            if line not in scores:
                scores[line] = fuzz.partial_ratio(query, line)

    # sorted() is stable, so ties keep first-seen (insertion) order.
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    return [line for line, _ in ranked[:top_n]]
|
|
|
|
|
|
|
| 578 |
|
|
|
|
| 579 |
def remove_stop_words_from_lines(lines, stop_words):
|
| 580 |
cleaned_lines = []
|
| 581 |
for line in lines:
|
|
|
|
| 584 |
cleaned_lines.append(" ".join(cleaned_words))
|
| 585 |
return cleaned_lines
|
| 586 |
|
| 587 |
+
# User input via chat_input
|
|
|
|
|
|
|
| 588 |
query = st.chat_input("چطور میتونم کمک کنم؟")
|
| 589 |
|
|
|
|
| 590 |
if query:
|
| 591 |
+
# نمایش پیام کاربر
|
| 592 |
+
st.markdown(f'<div style="background-color:#eef;padding:10px;border-radius:10px;margin:10px 0;"><strong>🧑:</strong> {query}</div>', unsafe_allow_html=True)
|
| 593 |
|
| 594 |
+
# نشان دادن وضعیت در حال فکر کردن
|
| 595 |
+
thinking = st.empty()
|
| 596 |
+
thinking.markdown("""
|
| 597 |
+
<div style="background-color:#f9f9f9;padding:10px;border-radius:10px;">
|
| 598 |
+
⏳ در حال فکر کردن...
|
| 599 |
+
</div>
|
| 600 |
""", unsafe_allow_html=True)
|
|
|
|
|
|
|
| 601 |
|
| 602 |
+
# جستجو در متن اسناد
|
| 603 |
+
closest_lines = find_closest_lines(query, doc_texts, stop_words, top_n=3)
|
| 604 |
cleaned_closest_lines = remove_stop_words_from_lines(closest_lines, stop_words)
|
| 605 |
|
| 606 |
if cleaned_closest_lines:
|
|
|
|
| 617 |
SystemMessage(content="You are a helpful assistant."),
|
| 618 |
HumanMessage(content=prompt)
|
| 619 |
])
|
| 620 |
+
final_answer = clean_text(response.content.strip())
|
| 621 |
+
else:
|
| 622 |
+
final_answer = "❗ هیچ خط مرتبطی با سؤال پیدا نشد."
|
|
|
|
|
|
|
| 623 |
|
| 624 |
+
# پاک کردن وضعیت در حال فکر کردن
|
| 625 |
+
thinking.empty()
|
| 626 |
|
| 627 |
+
# ذخیره و نمایش پاسخ
|
| 628 |
+
st.session_state.chat_history.append(("🧑", query))
|
| 629 |
+
st.session_state.chat_history.append(("🤖", final_answer))
|
| 630 |
|
| 631 |
# Render the conversation history.
import html  # local import: only this rendering step needs it

st.markdown("---")
for sender, message in st.session_state.chat_history:
    # The history holds raw user input, and the markup below is rendered
    # with unsafe_allow_html=True, so both fields are HTML-escaped first to
    # stop typed tags or scripts from being injected into the page.
    safe_sender = html.escape(str(sender))
    safe_message = html.escape(str(message))
    st.markdown(
        f'<div style="background-color:#f0f0f0;padding:10px;border-radius:10px;margin-bottom:5px;"><strong>{safe_sender}</strong>: {safe_message}</div>',
        unsafe_allow_html=True,
    )
|