Update app.py
Browse files
app.py
CHANGED
|
@@ -1,45 +1,29 @@
|
|
| 1 |
import streamlit as st
|
|
|
|
| 2 |
import os
|
| 3 |
-
import torch
|
| 4 |
-
import numpy as np
|
| 5 |
-
from hazm import *
|
| 6 |
import docx
|
| 7 |
-
from
|
| 8 |
-
from langchain.llms import OpenAI
|
| 9 |
-
from langchain.chat_models import ChatOpenAI
|
| 10 |
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
tokenizer = AutoTokenizer.from_pretrained("HooshvareLab/bert-fa-base-uncased")
|
| 14 |
-
model = AutoModel.from_pretrained("HooshvareLab/bert-fa-base-uncased")
|
| 15 |
-
|
| 16 |
-
@st.cache
|
| 17 |
-
def get_embedding(text):
|
| 18 |
-
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
|
| 19 |
-
with torch.no_grad():
|
| 20 |
-
outputs = model(**inputs)
|
| 21 |
-
embeddings = outputs.last_hidden_state.mean(dim=1)
|
| 22 |
-
return embeddings.squeeze().numpy()
|
| 23 |
-
|
| 24 |
-
def cosine_similarity(vec1, vec2):
|
| 25 |
-
return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
|
| 26 |
-
|
| 27 |
-
llm = ChatOpenAI(
|
| 28 |
base_url="https://api.together.xyz/v1",
|
| 29 |
api_key='0291f33aee03412a47fa5d8e562e515182dcc5d9aac5a7fb5eefdd1759005979',
|
| 30 |
model="meta-llama/Llama-3.3-70B-Instruct-Turbo-Free"
|
| 31 |
)
|
|
|
|
| 32 |
def rewrite_answer_with_llm(answer, user_input):
|
| 33 |
-
prompt = f"پاسخی که باید بازنویسی
|
| 34 |
-
response = llm(prompt)
|
| 35 |
-
return response['choices'][0]['text'].strip()
|
| 36 |
|
| 37 |
-
|
| 38 |
-
|
|
|
|
|
|
|
|
|
|
| 39 |
|
| 40 |
-
# بارگذاری
|
| 41 |
folder_path = '46'
|
| 42 |
texts = []
|
|
|
|
| 43 |
for filename in os.listdir(folder_path):
|
| 44 |
if filename.endswith(".docx"):
|
| 45 |
full_path = os.path.join(folder_path, filename)
|
|
@@ -48,30 +32,42 @@ for filename in os.listdir(folder_path):
|
|
| 48 |
if file_text.strip():
|
| 49 |
texts.append(file_text)
|
| 50 |
|
|
|
|
| 51 |
normalizer = Normalizer()
|
| 52 |
sentence_tokenizer = SentenceTokenizer()
|
|
|
|
| 53 |
all_sentences = []
|
| 54 |
for text in texts:
|
| 55 |
normalized = normalizer.normalize(text)
|
| 56 |
sentences = sentence_tokenizer.tokenize(normalized)
|
| 57 |
all_sentences.extend(sentences)
|
| 58 |
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
with st.spinner("در حال محاسبه شباهتها..."):
|
| 68 |
-
user_embedding = get_embedding(user_input)
|
| 69 |
-
similarities = [cosine_similarity(user_embedding, get_embedding(chunk)) for chunk in chunks]
|
| 70 |
-
most_similar_index = np.argmax(similarities)
|
| 71 |
-
most_similar_chunk = chunks[most_similar_index]
|
| 72 |
-
|
| 73 |
-
# بازنویسی پاسخ با مدل LLM
|
| 74 |
-
rewritten_answer = rewrite_answer_with_llm(most_similar_chunk, user_input)
|
| 75 |
-
|
| 76 |
-
st.subheader("📌 پاسخ بازنویسیشده:")
|
| 77 |
-
st.write(rewritten_answer)
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
+
from hazm import Normalizer, SentenceTokenizer
|
| 3 |
import os
|
|
|
|
|
|
|
|
|
|
| 4 |
import docx
|
| 5 |
+
from openai import OpenAI
|
|
|
|
|
|
|
| 6 |
|
| 7 |
+
# ── LLM setup ────────────────────────────────────────────────────────────
# Together AI exposes an OpenAI-compatible endpoint, so the stock OpenAI
# v1 client is pointed at their base URL.
#
# BUG FIX: OpenAI() does not accept a `model` keyword argument — the
# original call raised TypeError at import time, and the later
# `llm.model` attribute read would have failed too. The model name now
# lives in its own constant and is supplied per request, as the v1 SDK
# requires.
LLM_MODEL = "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free"

# SECURITY: the previous revision hard-coded a live API key in source
# control; that key must be considered leaked and revoked. The key is
# now read from the environment (set TOGETHER_API_KEY before running).
llm = OpenAI(
    base_url="https://api.together.xyz/v1",
    api_key=os.environ.get("TOGETHER_API_KEY", ""),
)


def rewrite_answer_with_llm(answer, user_input):
    """Rewrite *answer* with the LLM, matching the tone of the user's question.

    Parameters
    ----------
    answer : str
        The raw passage extracted from the book text.
    user_input : str
        The user's query; the prompt asks the model to rewrite the
        passage with this question's content and tone in mind.

    Returns
    -------
    str
        The model's rewritten text, stripped of surrounding whitespace.

    Raises
    ------
    openai.OpenAIError
        Propagated unchanged if the API call fails (bad key, network, ...).
    """
    # Prompt text is user-facing Persian; kept byte-identical.
    prompt = f"پاسخی که باید بازنویسی شود:\n{answer}\n\nلطفاً این پاسخ را با در نظر گرفتن محتوای سوال زیر و لحن آن بازنویسی کن:\n\nسوال: {user_input}"
    response = llm.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model=LLM_MODEL,
    )
    return response.choices[0].message.content.strip()
|
| 22 |
|
| 23 |
+
# 📁 بارگذاری فایلهای کتاب
|
| 24 |
folder_path = '46'
|
| 25 |
texts = []
|
| 26 |
+
|
| 27 |
for filename in os.listdir(folder_path):
|
| 28 |
if filename.endswith(".docx"):
|
| 29 |
full_path = os.path.join(folder_path, filename)
|
|
|
|
| 32 |
if file_text.strip():
|
| 33 |
texts.append(file_text)
|
| 34 |
|
| 35 |
+
# 🌀 Split the whole corpus into individual sentences.
# Each document is normalized with hazm first so the sentence tokenizer
# sees consistent Persian orthography.
normalizer = Normalizer()
sentence_tokenizer = SentenceTokenizer()

all_sentences = []
for raw_text in texts:
    cleaned = normalizer.normalize(raw_text)
    all_sentences.extend(sentence_tokenizer.tokenize(cleaned))
|
| 44 |
|
| 45 |
+
# 📌 Read the search phrase from the user.
query = st.text_input("🔎 کلمه یا عبارت موردنظر خود را وارد کنید:")

# ✅ Show the first sentence containing the phrase, the five sentences
# that follow it, and an LLM rewrite of that whole passage.
if query:
    # Index of the first matching sentence, or None if nothing matches.
    hit = next(
        (i for i, s in enumerate(all_sentences) if query in s),
        None,
    )

    if hit is None:
        st.warning("عبارت موردنظر در متن یافت نشد.")
    else:
        matched = all_sentences[hit]
        st.success("✅ جمله یافت شد:")
        st.write(matched)

        st.markdown("📌 پنج جمله بعدی:")
        # Slice is safely truncated at the end of the list, mirroring the
        # original bounds check.
        following = all_sentences[hit + 1 : hit + 6]
        for nxt in following:
            st.write(nxt)

        # ↪️ Stitch the passage together and have the LLM rewrite it
        # before emitting the header, so a failed API call aborts in the
        # same place it originally did.
        passage = matched + " " + " ".join(following)
        rewritten = rewrite_answer_with_llm(passage, query)
        st.markdown("🎨 **بازنویسی شده با LLM:**")
        st.write(rewritten)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|