Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -14,6 +14,7 @@ import re
|
|
| 14 |
import requests
|
| 15 |
from bs4 import BeautifulSoup
|
| 16 |
from difflib import SequenceMatcher
|
|
|
|
| 17 |
|
| 18 |
# تنظیم لاگگیری
|
| 19 |
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
|
|
@@ -36,7 +37,7 @@ def process_single_pdf(pdf_file):
|
|
| 36 |
try:
|
| 37 |
pages = loader.load_and_split()
|
| 38 |
docs = text_splitter.split_documents(pages)
|
| 39 |
-
sections = {"مقدمه": [], "روششناسی": [], "نتایج": [], "بحث": []}
|
| 40 |
for doc in docs:
|
| 41 |
text = doc.page_content
|
| 42 |
if re.search(r"مقدمه|Introduction", text, re.I):
|
|
@@ -47,6 +48,8 @@ def process_single_pdf(pdf_file):
|
|
| 47 |
sections["نتایج"].append(doc)
|
| 48 |
elif re.search(r"بحث|Discussion", text, re.I):
|
| 49 |
sections["بحث"].append(doc)
|
|
|
|
|
|
|
| 50 |
logger.info(f"پردازش فایل: {pdf_path} - تعداد تکهها: {len(docs)}")
|
| 51 |
return docs, sections
|
| 52 |
except Exception as e:
|
|
@@ -58,7 +61,7 @@ def upload_and_process_pdf(pdf_files):
|
|
| 58 |
return None, None, None, "لطفاً حداقل یک فایل PDF آپلود کنید."
|
| 59 |
logger.info(f"تعداد فایلهای ورودی: {len(pdf_files)}")
|
| 60 |
all_docs = []
|
| 61 |
-
all_sections = {"مقدمه": [], "روششناسی": [], "نتایج": [], "بحث": []}
|
| 62 |
with concurrent.futures.ThreadPoolExecutor() as executor:
|
| 63 |
future_to_file = {executor.submit(process_single_pdf, pdf_file): pdf_file for pdf_file in pdf_files}
|
| 64 |
for future in concurrent.futures.as_completed(future_to_file):
|
|
@@ -89,53 +92,53 @@ def create_vector_db(docs):
|
|
| 89 |
def check_plagiarism(text):
|
| 90 |
try:
|
| 91 |
query = text[:100]
|
| 92 |
-
# جستجوی گوگل (
|
| 93 |
-
url_global = f"https://www.google.com/search?q={query}"
|
| 94 |
-
response_global = requests.get(url_global, headers={"User-Agent": "Mozilla/5.0"})
|
| 95 |
-
soup_global = BeautifulSoup(response_global.text, 'html.parser')
|
| 96 |
-
results_global = [h.get_text() for h in soup_global.find_all('h3')[:3]]
|
| 97 |
-
|
| 98 |
-
# جستجوی گوگل با تنظیم فارسی
|
| 99 |
url_fa = f"https://www.google.com/search?q={query}&hl=fa"
|
| 100 |
response_fa = requests.get(url_fa, headers={"User-Agent": "Mozilla/5.0"})
|
| 101 |
soup_fa = BeautifulSoup(response_fa.text, 'html.parser')
|
| 102 |
-
results_fa = [h.get_text() for h in soup_fa.find_all('h3')[:
|
| 103 |
|
| 104 |
# جستجو در SID.ir
|
| 105 |
url_sid = f"https://www.sid.ir/Fa/Journal/SearchPaper.aspx?str={query}"
|
| 106 |
response_sid = requests.get(url_sid, headers={"User-Agent": "Mozilla/5.0"})
|
| 107 |
soup_sid = BeautifulSoup(response_sid.text, 'html.parser')
|
| 108 |
-
results_sid = [item.get_text() for item in soup_sid.select('.title')[:
|
| 109 |
|
| 110 |
-
|
|
|
|
| 111 |
max_similarity = 0
|
|
|
|
| 112 |
for result in all_results:
|
| 113 |
similarity = SequenceMatcher(None, text[:500], result).ratio()
|
| 114 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
|
| 116 |
-
time.sleep(1)
|
| 117 |
-
|
|
|
|
|
|
|
| 118 |
except Exception as e:
|
| 119 |
logger.error(f"خطا در چک سرقت ادبی: {str(e)}")
|
| 120 |
-
return
|
| 121 |
|
| 122 |
def suggest_resources(text):
|
| 123 |
try:
|
| 124 |
query = " ".join(text.split()[:5])
|
| 125 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
url_arxiv = f"https://arxiv.org/search/?query={query}&searchtype=all&source=header"
|
| 127 |
response_arxiv = requests.get(url_arxiv, headers={"User-Agent": "Mozilla/5.0"})
|
| 128 |
soup_arxiv = BeautifulSoup(response_arxiv.text, 'html.parser')
|
| 129 |
papers_arxiv = [paper.get_text().strip() for paper in soup_arxiv.find_all('p', class_='title')[:2]]
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
response_sid = requests.get(url_sid, headers={"User-Agent": "Mozilla/5.0"})
|
| 134 |
-
soup_sid = BeautifulSoup(response_sid.text, 'html.parser')
|
| 135 |
-
papers_sid = [item.get_text().strip() for item in soup_sid.select('.title')[:2]]
|
| 136 |
-
|
| 137 |
-
resources = papers_arxiv + papers_sid if papers_sid else papers_arxiv
|
| 138 |
-
time.sleep(1) # فاصله برای جلوگیری از 429
|
| 139 |
return resources if resources else ["منبعی یافت نشد."]
|
| 140 |
except Exception as e:
|
| 141 |
logger.error(f"خطا در پیشنهاد منابع: {str(e)}")
|
|
@@ -144,15 +147,45 @@ def suggest_resources(text):
|
|
| 144 |
def evaluate_quality(docs):
|
| 145 |
text = " ".join([doc.page_content for doc in docs])
|
| 146 |
score = 0
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
if re.search(r"جدول|شکل|Table|Figure", text, re.I):
|
| 152 |
score += 20
|
| 153 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
score += 20
|
| 155 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
|
| 157 |
llm_gemini = ChatGoogleGenerativeAI(model="gemini-pro", google_api_key=gemini_api_key, convert_system_message_to_human=True, temperature=0.5)
|
| 158 |
|
|
@@ -169,7 +202,11 @@ academic_analysis_prompt = PromptTemplate(
|
|
| 169 |
)
|
| 170 |
|
| 171 |
summary_prompt = PromptTemplate(
|
| 172 |
-
template="""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 173 |
**متن:**
|
| 174 |
{context}
|
| 175 |
**خلاصه:**""",
|
|
@@ -187,16 +224,16 @@ plagiarism_prompt = PromptTemplate(
|
|
| 187 |
template="""درصد تشابه متن زیر با منابع عمومی و فارسی را گزارش دهید:
|
| 188 |
**متن:**
|
| 189 |
{context}
|
| 190 |
-
**نتیجه:** {similarity}
|
| 191 |
input_variables=["context", "similarity"]
|
| 192 |
)
|
| 193 |
|
| 194 |
quality_prompt = PromptTemplate(
|
| 195 |
-
template="""
|
| 196 |
**متن:**
|
| 197 |
{context}
|
| 198 |
**امتیاز:** {score}/100
|
| 199 |
-
|
| 200 |
input_variables=["context", "score", "explanation"]
|
| 201 |
)
|
| 202 |
|
|
@@ -252,17 +289,16 @@ def academic_chatbot(pdf_file, mode, query, language, detail_level, section_drop
|
|
| 252 |
try:
|
| 253 |
if mode == "خلاصه خودکار":
|
| 254 |
context = " ".join([doc.page_content for doc in docs])
|
| 255 |
-
time.sleep(2)
|
| 256 |
-
result = chain.invoke({"context": context[:5000]})["text"]
|
| 257 |
elif mode == "چک سرقت ادبی":
|
| 258 |
context = " ".join([doc.page_content for doc in docs if section_dropdown == "کل سند" or doc in sections.get(section_dropdown, [])])
|
| 259 |
-
|
| 260 |
-
result =
|
| 261 |
elif mode == "ارزیابی کیفیت":
|
| 262 |
context = " ".join([doc.page_content for doc in docs if section_dropdown == "کل سند" or doc in sections.get(section_dropdown, [])])
|
| 263 |
-
score = evaluate_quality(docs if section_dropdown == "کل سند" else sections.get(section_dropdown, []))
|
| 264 |
-
|
| 265 |
-
time.sleep(2) # فاصله برای کاهش درخواستها
|
| 266 |
result = chain.invoke({"context": context[:5000], "score": score, "explanation": explanation})["text"]
|
| 267 |
else:
|
| 268 |
result = chain.invoke({"question": query, "chat_history": []})["answer"]
|
|
@@ -290,7 +326,7 @@ with gr.Blocks(title="تحلیلگر حرفهای پایاننامه") as
|
|
| 290 |
value="تحلیل آکادمیک (RAG)"
|
| 291 |
)
|
| 292 |
query = gr.Textbox(lines=3, placeholder="سوال یا درخواست خود را بنویسید...", label="سوال/درخواست")
|
| 293 |
-
section = gr.Dropdown(["کل سند", "مقدمه", "روششناسی", "نتایج", "بحث"], label="بخش موردنظر", value="کل سند")
|
| 294 |
language = gr.Dropdown(["فارسی", "English"], label="زبان پاسخ", value="فارسی")
|
| 295 |
detail = gr.Dropdown(["خلاصه", "جامع"], label="سطح جزئیات", value="جامع")
|
| 296 |
submit = gr.Button("ارسال")
|
|
|
|
| 14 |
import requests
|
| 15 |
from bs4 import BeautifulSoup
|
| 16 |
from difflib import SequenceMatcher
|
| 17 |
+
from collections import Counter
|
| 18 |
|
| 19 |
# تنظیم لاگگیری
|
| 20 |
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
|
|
|
|
| 37 |
try:
|
| 38 |
pages = loader.load_and_split()
|
| 39 |
docs = text_splitter.split_documents(pages)
|
| 40 |
+
sections = {"مقدمه": [], "روششناسی": [], "نتایج": [], "بحث": [], "منابع": []}
|
| 41 |
for doc in docs:
|
| 42 |
text = doc.page_content
|
| 43 |
if re.search(r"مقدمه|Introduction", text, re.I):
|
|
|
|
| 48 |
sections["نتایج"].append(doc)
|
| 49 |
elif re.search(r"بحث|Discussion", text, re.I):
|
| 50 |
sections["بحث"].append(doc)
|
| 51 |
+
elif re.search(r"منابع|References|Bibliography", text, re.I):
|
| 52 |
+
sections["منابع"].append(doc)
|
| 53 |
logger.info(f"پردازش فایل: {pdf_path} - تعداد تکهها: {len(docs)}")
|
| 54 |
return docs, sections
|
| 55 |
except Exception as e:
|
|
|
|
| 61 |
return None, None, None, "لطفاً حداقل یک فایل PDF آپلود کنید."
|
| 62 |
logger.info(f"تعداد فایلهای ورودی: {len(pdf_files)}")
|
| 63 |
all_docs = []
|
| 64 |
+
all_sections = {"مقدمه": [], "روششناسی": [], "نتایج": [], "بحث": [], "منابع": []}
|
| 65 |
with concurrent.futures.ThreadPoolExecutor() as executor:
|
| 66 |
future_to_file = {executor.submit(process_single_pdf, pdf_file): pdf_file for pdf_file in pdf_files}
|
| 67 |
for future in concurrent.futures.as_completed(future_to_file):
|
|
|
|
| 92 |
def check_plagiarism(text):
    """Estimate how similar *text* is to titles returned by web searches.

    The first 100 characters of ``text`` are used as the search query against
    Google (with ``hl=fa``) and SID.ir. Up to five result titles are taken
    from each source and compared with the first 500 characters of ``text``
    using ``difflib.SequenceMatcher``. The highest ratio is reported as a
    percentage, followed by up to three of the best-matching titles.

    Args:
        text: The passage to check.

    Returns:
        A report string with the similarity percentage and matched titles, or
        an error-message string if any step raised an exception.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    timeout_seconds = 10  # without a timeout, requests may block forever
    try:
        all_results = []
        # Blank input has nothing to search for: skip the network round-trips
        # (and the rate-limit pause) and fall through to a 0% report.
        if text.strip():
            query = text[:100]

            # Google search (Persian). The query is passed through ``params``
            # so that characters such as '&', '#', '+' or '=' inside the
            # passage are percent-encoded instead of corrupting the URL.
            response_fa = requests.get(
                "https://www.google.com/search",
                params={"q": query, "hl": "fa"},
                headers=headers,
                timeout=timeout_seconds,
            )
            soup_fa = BeautifulSoup(response_fa.text, 'html.parser')
            results_fa = [h.get_text() for h in soup_fa.find_all('h3')[:5]]

            # Search on SID.ir
            response_sid = requests.get(
                "https://www.sid.ir/Fa/Journal/SearchPaper.aspx",
                params={"str": query},
                headers=headers,
                timeout=timeout_seconds,
            )
            soup_sid = BeautifulSoup(response_sid.text, 'html.parser')
            results_sid = [item.get_text() for item in soup_sid.select('.title')[:5]]

            # Combine the results of both sources
            all_results = results_fa + results_sid
            time.sleep(1)  # pause between calls to avoid hammering the sites

        max_similarity = 0
        matched_texts = []
        sample = text[:500]
        for result in all_results:
            similarity = SequenceMatcher(None, sample, result).ratio()
            if similarity > max_similarity:
                max_similarity = similarity
                matched_texts = [result]
            elif similarity == max_similarity and similarity > 0:
                # Keep ties for the best score, but never list a title that
                # has zero similarity as a "match".
                matched_texts.append(result)

        similarity_percent = max_similarity * 100
        output = f"درصد تشابه: {similarity_percent:.2f}%\n**متنهای مشابه:**\n" + "\n".join(matched_texts[:3])
        return output
    except Exception as e:
        logger.error(f"خطا در چک سرقت ادبی: {str(e)}")
        return f"خطا در بررسی سرقت ادبی: {str(e)}"
|
| 126 |
|
| 127 |
def suggest_resources(text):
|
| 128 |
try:
|
| 129 |
query = " ".join(text.split()[:5])
|
| 130 |
+
url_sid = f"https://www.sid.ir/Fa/Journal/SearchPaper.aspx?str={query}"
|
| 131 |
+
response_sid = requests.get(url_sid, headers={"User-Agent": "Mozilla/5.0"})
|
| 132 |
+
soup_sid = BeautifulSoup(response_sid.text, 'html.parser')
|
| 133 |
+
papers_sid = [item.get_text().strip() for item in soup_sid.select('.title')[:3]]
|
| 134 |
+
|
| 135 |
url_arxiv = f"https://arxiv.org/search/?query={query}&searchtype=all&source=header"
|
| 136 |
response_arxiv = requests.get(url_arxiv, headers={"User-Agent": "Mozilla/5.0"})
|
| 137 |
soup_arxiv = BeautifulSoup(response_arxiv.text, 'html.parser')
|
| 138 |
papers_arxiv = [paper.get_text().strip() for paper in soup_arxiv.find_all('p', class_='title')[:2]]
|
| 139 |
+
|
| 140 |
+
resources = papers_sid + papers_arxiv if papers_sid else papers_arxiv
|
| 141 |
+
time.sleep(1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
return resources if resources else ["منبعی یافت نشد."]
|
| 143 |
except Exception as e:
|
| 144 |
logger.error(f"خطا در پیشنهاد منابع: {str(e)}")
|
|
|
|
| 147 |
def evaluate_quality(docs, sections=None):
    """Score the academic quality of *docs* with four simple heuristics.

    The score is built from: citation count (up to 35 points), keyword
    repetition as a proxy for topical focus (25), mention of tables/figures
    (20) and the length of the results/discussion material (20). The final
    score is clamped to the range 10..100.

    Args:
        docs: Iterable of document chunks exposing a ``page_content`` string.
        sections: Optional mapping of section name to the list of chunks in
            that section (keys "نتایج" and "بحث" are read). When omitted, a
            module-level ``sections`` is used if one is defined; otherwise
            chunks are classified by a results/discussion keyword search.

    Returns:
        A ``(score, explanation)`` tuple, where ``explanation`` is the
        per-criterion findings joined with "; ".
    """
    docs = list(docs)
    text = " ".join([doc.page_content for doc in docs])
    score = 0
    explanation = []

    # Criterion 1: presence and quality of references
    # Citations such as "[1]" or "Name 2020".
    ref_count = len(re.findall(r"\[\d+\]|[A-Za-z]+\s+\d{4}", text))
    if ref_count > 10:
        score += 35
        explanation.append("منابع کافی و قابل استناد (بیش از 10 ارجاع).")
    elif ref_count > 0:
        score += 20
        explanation.append("منابع موجود اما محدود (کمتر از 10 ارجاع).")
    else:
        explanation.append("منابع کافی یافت نشد.")

    # Criterion 2: textual coherence (repetition of key words)
    words = text.split()
    word_freq = Counter(words).most_common(10)
    # The most frequent word must account for more than 2% of all words.
    if word_freq and word_freq[0][1] > len(words) * 0.02:
        score += 25
        explanation.append("انسجام متنی خوب (تمرکز بر موضوع اصلی).")
    else:
        explanation.append("انسجام متنی ضعیف (پراکندگی موضوعی).")

    # Criterion 3: use of tables/figures
    if re.search(r"جدول|شکل|Table|Figure", text, re.I):
        score += 20
        explanation.append("استفاده از جداول یا شکلها برای پشتیبانی یافتهها.")
    else:
        explanation.append("عدم استفاده از جداول یا شکلها.")

    # Criterion 4: depth of analysis (length of the results/discussion part)
    if sections is None:
        # The function used to read a free variable named ``sections``;
        # honour a module-level one if it exists instead of raising NameError.
        sections = globals().get("sections")
    if sections is None:
        # No section map available: fall back to a keyword heuristic.
        analysis_chunks = [
            doc.page_content
            for doc in docs
            if re.search(r"نتایج|Results|بحث|Discussion", doc.page_content, re.I)
        ]
    else:
        # Build the combined list once rather than once per document.
        analysis_docs = sections.get("نتایج", []) + sections.get("بحث", [])
        analysis_chunks = [doc.page_content for doc in docs if doc in analysis_docs]
    analysis_text = " ".join(analysis_chunks)
    if len(analysis_text.split()) > 1000:
        score += 20
        explanation.append("عمق تحلیل قابل قبول (بخش نتایج/بحث طولانی).")
    else:
        explanation.append("عمق تحلیل محدود (بخش نتایج/بحث کوتاه).")

    score = max(min(score, 100), 10)
    return score, "; ".join(explanation)
|
| 189 |
|
| 190 |
# Shared Gemini chat model instance: model "gemini-pro", temperature 0.5,
# authenticated with `gemini_api_key` (defined outside this excerpt).
# `convert_system_message_to_human=True` is passed through to the wrapper.
llm_gemini = ChatGoogleGenerativeAI(model="gemini-pro", google_api_key=gemini_api_key, convert_system_message_to_human=True, temperature=0.5)
|
| 191 |
|
|
|
|
| 202 |
)
|
| 203 |
|
| 204 |
summary_prompt = PromptTemplate(
|
| 205 |
+
template="""شما یک متخصص نگارش آکادمیک هستید. یک خلاصه علمی ساختارمند (200-300 کلمه) از متن زیر به زبان {language} تولید کنید که شامل:
|
| 206 |
+
1. هدف تحقیق
|
| 207 |
+
2. روششناسی
|
| 208 |
+
3. یافتههای اصلی
|
| 209 |
+
4. نتیجهگیری
|
| 210 |
**متن:**
|
| 211 |
{context}
|
| 212 |
**خلاصه:**""",
|
|
|
|
| 224 |
template="""درصد تشابه متن زیر با منابع عمومی و فارسی را گزارش دهید:
|
| 225 |
**متن:**
|
| 226 |
{context}
|
| 227 |
+
**نتیجه:** {similarity}""",
|
| 228 |
input_variables=["context", "similarity"]
|
| 229 |
)
|
| 230 |
|
| 231 |
quality_prompt = PromptTemplate(
|
| 232 |
+
template="""شما یک ارزیاب آکادمیک حرفهای هستید. کیفیت علمی متن زیر را ارزیابی کنید:
|
| 233 |
**متن:**
|
| 234 |
{context}
|
| 235 |
**امتیاز:** {score}/100
|
| 236 |
+
**توضیحات:** {explanation}""",
|
| 237 |
input_variables=["context", "score", "explanation"]
|
| 238 |
)
|
| 239 |
|
|
|
|
| 289 |
try:
|
| 290 |
if mode == "خلاصه خودکار":
|
| 291 |
context = " ".join([doc.page_content for doc in docs])
|
| 292 |
+
time.sleep(2)
|
| 293 |
+
result = chain.invoke({"context": context[:5000]})["text"]
|
| 294 |
elif mode == "چک سرقت ادبی":
|
| 295 |
context = " ".join([doc.page_content for doc in docs if section_dropdown == "کل سند" or doc in sections.get(section_dropdown, [])])
|
| 296 |
+
plagiarism_result = check_plagiarism(context)
|
| 297 |
+
result = plagiarism_result
|
| 298 |
elif mode == "ارزیابی کیفیت":
|
| 299 |
context = " ".join([doc.page_content for doc in docs if section_dropdown == "کل سند" or doc in sections.get(section_dropdown, [])])
|
| 300 |
+
score, explanation = evaluate_quality(docs if section_dropdown == "کل سند" else sections.get(section_dropdown, []))
|
| 301 |
+
time.sleep(2)
|
|
|
|
| 302 |
result = chain.invoke({"context": context[:5000], "score": score, "explanation": explanation})["text"]
|
| 303 |
else:
|
| 304 |
result = chain.invoke({"question": query, "chat_history": []})["answer"]
|
|
|
|
| 326 |
value="تحلیل آکادمیک (RAG)"
|
| 327 |
)
|
| 328 |
query = gr.Textbox(lines=3, placeholder="سوال یا درخواست خود را بنویسید...", label="سوال/درخواست")
|
| 329 |
+
section = gr.Dropdown(["کل سند", "مقدمه", "روششناسی", "نتایج", "بحث", "منابع"], label="بخش موردنظر", value="کل سند")
|
| 330 |
language = gr.Dropdown(["فارسی", "English"], label="زبان پاسخ", value="فارسی")
|
| 331 |
detail = gr.Dropdown(["خلاصه", "جامع"], label="سطح جزئیات", value="جامع")
|
| 332 |
submit = gr.Button("ارسال")
|