Update app.py
Browse files
app.py
CHANGED
|
@@ -19,7 +19,6 @@ st.set_page_config(
|
|
| 19 |
layout="wide"
|
| 20 |
)
|
| 21 |
|
| 22 |
-
st.title("دستیارهوشمند ارتش ")
|
| 23 |
|
| 24 |
st.markdown("""
|
| 25 |
<style>
|
|
@@ -463,162 +462,98 @@ st.markdown("""
|
|
| 463 |
</style>
|
| 464 |
""", unsafe_allow_html=True)
|
| 465 |
|
| 466 |
-
import
|
| 467 |
-
import
|
| 468 |
-
import docx
|
| 469 |
import streamlit as st
|
| 470 |
-
import
|
| 471 |
-
from hazm import Normalizer
|
| 472 |
-
from rapidfuzz import fuzz
|
| 473 |
from langchain.schema import SystemMessage, HumanMessage
|
| 474 |
from langchain.chat_models import ChatOpenAI
|
| 475 |
-
|
| 476 |
-
|
| 477 |
-
|
| 478 |
-
|
| 479 |
-
|
| 480 |
-
|
| 481 |
-
|
| 482 |
-
|
| 483 |
-
|
| 484 |
-
|
| 485 |
-
|
| 486 |
-
|
| 487 |
-
|
| 488 |
-
|
| 489 |
-
|
| 490 |
-
|
| 491 |
-
|
| 492 |
-
|
| 493 |
-
|
| 494 |
-
|
| 495 |
-
|
| 496 |
-
|
| 497 |
-
|
| 498 |
-
|
| 499 |
-
|
| 500 |
-
|
| 501 |
-
|
| 502 |
-
|
| 503 |
-
|
| 504 |
-
|
| 505 |
-
|
| 506 |
-
|
| 507 |
-
|
| 508 |
-
|
| 509 |
-
]
|
| 510 |
-
def remove_stop_words(text, stop_words):
|
| 511 |
-
words = text.split()
|
| 512 |
-
return " ".join([word for word in words if word not in stop_words])
|
| 513 |
-
|
| 514 |
-
def extract_keywords_from_text(text, query_words):
|
| 515 |
-
matched_lines = []
|
| 516 |
-
lines = text.split("\n")
|
| 517 |
-
for line in lines:
|
| 518 |
-
if any(query_word in line for query_word in query_words):
|
| 519 |
-
matched_lines.append(line)
|
| 520 |
-
return matched_lines
|
| 521 |
|
| 522 |
def clean_text(text):
|
| 523 |
-
|
| 524 |
-
|
| 525 |
-
def find_closest_lines(query, doc_texts, stop_words, top_n=10):
|
| 526 |
-
cleaned_query = remove_stop_words(query, stop_words)
|
| 527 |
-
query_words = cleaned_query.split()
|
| 528 |
-
all_matched_lines = []
|
| 529 |
-
|
| 530 |
-
for filename, text in doc_texts.items():
|
| 531 |
-
matched_lines = extract_keywords_from_text(text, query_words)
|
| 532 |
-
for line in matched_lines:
|
| 533 |
-
similarity = fuzz.partial_ratio(query, line)
|
| 534 |
-
all_matched_lines.append((line, similarity))
|
| 535 |
-
|
| 536 |
-
all_matched_lines.sort(key=lambda x: x[1], reverse=True)
|
| 537 |
-
return [line for line, _ in all_matched_lines[:top_n]]
|
| 538 |
-
|
| 539 |
-
def remove_stop_words_from_lines(lines, stop_words):
|
| 540 |
-
cleaned_lines = []
|
| 541 |
-
for line in lines:
|
| 542 |
-
words = line.split()
|
| 543 |
-
cleaned_words = [word for word in words if word not in stop_words]
|
| 544 |
-
cleaned_lines.append(" ".join(cleaned_words))
|
| 545 |
-
return cleaned_lines
|
| 546 |
|
| 547 |
-
st.markdown("""
|
| 548 |
-
<style>
|
| 549 |
-
/* تنظیمات برای بالا بردن موقعیت input و ضخیمتر کردن فونت */
|
| 550 |
-
div[data-baseweb="input"] {
|
| 551 |
-
margin-top: 1px !important; /* فاصله از بالا (کم کن یا زیاد کن به دلخواه) */
|
| 552 |
-
font-weight: 800 !important; /* فونت کلفت */
|
| 553 |
-
font-size: 22px !important; /* اندازه فونت بزرگتر */
|
| 554 |
-
font-family: "Vazir", sans-serif !important; /* اگر فونت فارسی دادی */
|
| 555 |
-
direction: rtl !important; /* راست به چپ */
|
| 556 |
-
text-align: right !important; /* متن راست چین */
|
| 557 |
-
}
|
| 558 |
-
</style>
|
| 559 |
-
""", unsafe_allow_html=True)
|
| 560 |
|
| 561 |
-
query = st.
|
|
|
|
|
|
|
|
|
|
| 562 |
|
| 563 |
if query:
|
| 564 |
-
|
| 565 |
thinking = st.empty()
|
| 566 |
-
thinking.markdown(""
|
| 567 |
-
<div style="background-color:#0d4d31;padding:10px;border-radius:10px;">
|
| 568 |
-
⏳ در حال فکر کردن...
|
| 569 |
-
</div>
|
| 570 |
-
""", unsafe_allow_html=True)
|
| 571 |
|
| 572 |
-
|
| 573 |
-
|
|
|
|
|
|
|
| 574 |
|
| 575 |
-
|
| 576 |
prompt = f"""
|
| 577 |
-
|
| 578 |
-
|
| 579 |
-
|
| 580 |
-
|
| 581 |
-
پاسخ باید نهایی، روان، و در حدود 512 تا 2048 کاراکتر باشد.
|
| 582 |
-
مستقیماً پاسخ را بنویس و هیچ توضیحی درباره نحوه رسیدن به پاسخ نده.
|
| 583 |
-
|
| 584 |
-
سوال:
|
| 585 |
{query}
|
| 586 |
-
|
| 587 |
-
|
| 588 |
-
{
|
| 589 |
-
|
| 590 |
پاسخ نهایی:
|
| 591 |
"""
|
|
|
|
| 592 |
response = llm([
|
| 593 |
SystemMessage(
|
| 594 |
-
|
| 595 |
),
|
| 596 |
HumanMessage(content=prompt)
|
| 597 |
])
|
| 598 |
final_answer = clean_text(response.content.strip())
|
| 599 |
-
|
| 600 |
-
|
|
|
|
| 601 |
|
| 602 |
thinking.empty()
|
| 603 |
|
| 604 |
st.session_state.chat_history.append(("🧑", query))
|
| 605 |
st.session_state.chat_history.append(("🤖", final_answer))
|
| 606 |
|
| 607 |
-
|
| 608 |
-
<style>
|
| 609 |
-
@import url('https://cdn.fontcdn.ir/Font/Persian/Vazir/Vazir.css');
|
| 610 |
-
div.chat-message {
|
| 611 |
-
font-family: 'Vazir', sans-serif;
|
| 612 |
-
font-size: 16px;
|
| 613 |
-
color: white;
|
| 614 |
-
background-color: #0d4d31;
|
| 615 |
-
padding: 10px;
|
| 616 |
-
border-radius: 10px;
|
| 617 |
-
margin-bottom: 5px;
|
| 618 |
-
}
|
| 619 |
-
</style>
|
| 620 |
-
""", unsafe_allow_html=True)
|
| 621 |
-
|
| 622 |
st.markdown("---")
|
| 623 |
for sender, message in st.session_state.chat_history:
|
| 624 |
-
st.markdown(f'<div
|
|
|
|
| 19 |
layout="wide"
|
| 20 |
)
|
| 21 |
|
|
|
|
| 22 |
|
| 23 |
st.markdown("""
|
| 24 |
<style>
|
|
|
|
| 462 |
</style>
|
| 463 |
""", unsafe_allow_html=True)
|
| 464 |
|
| 465 |
+
import json
import os

import numpy as np
import requests
import streamlit as st
from langchain.chat_models import ChatOpenAI
from langchain.schema import SystemMessage, HumanMessage
from sklearn.metrics.pairwise import cosine_similarity
|
| 472 |
+
|
| 473 |
+
EMBEDDING_FILE = "embeddings.json"
|
| 474 |
+
EMBEDDING_MODEL = "intfloat/multilingual-e5-large-instruct"
|
| 475 |
+
TOGETHER_API_KEY = "333ac33f5be91819cb7ade101134d73f5e63d299a964ae290850eeac5d82a8d5"
|
| 476 |
+
|
| 477 |
+
@st.cache_data
|
| 478 |
+
def load_embeddings(file_path):
|
| 479 |
+
with open(file_path, "r", encoding="utf-8") as f:
|
| 480 |
+
return json.load(f)
|
| 481 |
+
|
| 482 |
+
def get_query_embedding_together(query):
|
| 483 |
+
url = "https://api.together.xyz/v1/embeddings"
|
| 484 |
+
headers = {
|
| 485 |
+
"Authorization": f"Bearer {TOGETHER_API_KEY}",
|
| 486 |
+
"accept": "application/json",
|
| 487 |
+
"content-type": "application/json"
|
| 488 |
+
}
|
| 489 |
+
payload = {
|
| 490 |
+
"model": EMBEDDING_MODEL,
|
| 491 |
+
"input": query
|
| 492 |
+
}
|
| 493 |
+
response = requests.post(url, headers=headers, json=payload)
|
| 494 |
+
response.raise_for_status()
|
| 495 |
+
return response.json()["data"][0]["embedding"]
|
| 496 |
+
|
| 497 |
+
def find_most_similar_chunks(query_embedding, data, top_n=3):
|
| 498 |
+
query_vec = np.array(query_embedding).reshape(1, -1)
|
| 499 |
+
similarities = []
|
| 500 |
+
for item in data:
|
| 501 |
+
chunk_vec = np.array(item["embedding"]).reshape(1, -1)
|
| 502 |
+
sim = cosine_similarity(query_vec, chunk_vec)[0][0]
|
| 503 |
+
similarities.append((item["chunk"], sim))
|
| 504 |
+
similarities.sort(key=lambda x: x[1], reverse=True)
|
| 505 |
+
return [chunk for chunk, _ in similarities[:top_n]]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 506 |
|
| 507 |
def clean_text(text):
|
| 508 |
+
import re
|
| 509 |
+
return re.sub(r'[^آ-یa-zA-Z0-9۰-۹,.،؟!؛\s]+', '', text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 510 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 511 |
|
| 512 |
+
query = st.text_input("سؤال خود را وارد کنید:")
|
| 513 |
+
|
| 514 |
+
if "chat_history" not in st.session_state:
|
| 515 |
+
st.session_state.chat_history = []
|
| 516 |
|
| 517 |
if query:
|
|
|
|
| 518 |
thinking = st.empty()
|
| 519 |
+
thinking.markdown("⏳ در حال پردازش...")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 520 |
|
| 521 |
+
try:
|
| 522 |
+
query_embedding = get_query_embedding_together(query)
|
| 523 |
+
data = load_embeddings(EMBEDDING_FILE)
|
| 524 |
+
top_chunks = find_most_similar_chunks(query_embedding, data, top_n=3)
|
| 525 |
|
| 526 |
+
context = "\n".join(top_chunks)
|
| 527 |
prompt = f"""
|
| 528 |
+
فقط و فقط با استفاده از محتوای زیر به سؤال پاسخ بده.
|
| 529 |
+
اگر اطلاعات کافی نبود، واضح بگو اطلاعات کافی وجود ندارد، سپس با دانش عمومی پاسخ بده.
|
| 530 |
+
|
| 531 |
+
سؤال:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 532 |
{query}
|
| 533 |
+
|
| 534 |
+
محتوا:
|
| 535 |
+
{context}
|
| 536 |
+
|
| 537 |
پاسخ نهایی:
|
| 538 |
"""
|
| 539 |
+
|
| 540 |
response = llm([
|
| 541 |
SystemMessage(
|
| 542 |
+
content="تو یک دستیار دقیق هستی که فقط با اطلاعات موجود در متن پاسخ میدهی. اگر اطلاعات نبود، آن را اعلام میکنی و بعد از دانش خودت استفاده میکنی."
|
| 543 |
),
|
| 544 |
HumanMessage(content=prompt)
|
| 545 |
])
|
| 546 |
final_answer = clean_text(response.content.strip())
|
| 547 |
+
|
| 548 |
+
except Exception as e:
|
| 549 |
+
final_answer = f"❗ خطا: {str(e)}"
|
| 550 |
|
| 551 |
thinking.empty()
|
| 552 |
|
| 553 |
st.session_state.chat_history.append(("🧑", query))
|
| 554 |
st.session_state.chat_history.append(("🤖", final_answer))
|
| 555 |
|
| 556 |
+
# نمایش چت
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 557 |
st.markdown("---")
|
| 558 |
for sender, message in st.session_state.chat_history:
|
| 559 |
+
st.markdown(f'<div style="direction:rtl;text-align:right;padding:10px;border-radius:10px;background-color:#0d4d31;color:white;"><strong>{sender}</strong>: {message}</div>', unsafe_allow_html=True)
|