Spaces:
Sleeping
Sleeping
| import os | |
| import google.generativeai as genai | |
| import gradio as gr | |
| from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings | |
| from langchain.prompts import PromptTemplate | |
| from langchain.chains import ConversationalRetrievalChain, LLMChain | |
| from langchain_community.document_loaders import PyPDFLoader | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain_community.vectorstores import FAISS | |
| import time | |
| import concurrent.futures | |
| import logging | |
| import re | |
| import requests | |
| from bs4 import BeautifulSoup | |
| from difflib import SequenceMatcher | |
| from collections import Counter | |
| import matplotlib.pyplot as plt | |
| from io import BytesIO | |
| import base64 | |
# Configure logging for the whole module.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# Configure the Gemini API key.
# GEMINI_API_KEY is the primary variable; GOOGLE_API_KEY is accepted as a
# fallback because the original error message told users to set that name.
gemini_api_key = os.environ.get('GEMINI_API_KEY') or os.environ.get('GOOGLE_API_KEY')
if not gemini_api_key:
    # Name the variables that are actually read, so the hint is actionable.
    raise ValueError("GEMINI_API_KEY (or GOOGLE_API_KEY) not found. Please set it in the Space settings.")
genai.configure(api_key=gemini_api_key)
# Section name -> compiled keyword pattern (English and Persian), in priority
# order: a chunk is filed under the FIRST section whose pattern it matches.
_SECTION_PATTERNS = (
    ("Introduction", re.compile(r"Introduction|مقدمه", re.I)),
    ("Methodology", re.compile(r"Methodology|روش", re.I)),
    ("Results", re.compile(r"Results|نتایج", re.I)),
    ("Discussion", re.compile(r"Discussion|بحث", re.I)),
    ("References", re.compile(r"References|Bibliography|منابع", re.I)),
)


def process_single_pdf(pdf_file):
    """Load one PDF, split it into chunks and bucket the chunks by section.

    Args:
        pdf_file: A filesystem path, or an object exposing the path as ``.name``.

    Returns:
        ``(docs, sections)`` where ``docs`` is the list of chunk documents and
        ``sections`` maps each section name to the chunks whose text matches
        that section's keywords. Returns ``(None, None)`` when the file does
        not exist or loading/splitting fails (the error is logged).
    """
    pdf_path = pdf_file.name if hasattr(pdf_file, 'name') else pdf_file
    logger.info(f"Starting to process file: {pdf_path}")
    if not os.path.isfile(pdf_path):
        logger.error(f"File {pdf_path} does not exist.")
        return None, None

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=150)
    loader = PyPDFLoader(pdf_path)
    try:
        # Load the raw pages and split exactly once. The previous
        # load_and_split() call already split with the loader's default
        # splitter, so the second pass re-split text that carried overlap.
        pages = loader.load()
        docs = text_splitter.split_documents(pages)

        sections = {name: [] for name, _ in _SECTION_PATTERNS}
        for doc in docs:
            for name, pattern in _SECTION_PATTERNS:
                if pattern.search(doc.page_content):
                    sections[name].append(doc)
                    break  # first matching section wins

        logger.info(f"Processed file: {pdf_path} - Number of chunks: {len(docs)}")
        return docs, sections
    except Exception as e:
        # Best-effort by design: the caller turns (None, None) into a user message.
        logger.error(f"Error processing {pdf_path}: {str(e)}")
        return None, None
def upload_and_process_pdf(pdf_files):
    """Process several PDFs concurrently and merge their chunks.

    Args:
        pdf_files: Sequence of paths or file-like objects exposing ``.name``.

    Returns:
        A 4-tuple ``(None, all_docs, all_sections, error)``. The first slot is
        always ``None``. On success ``error`` is ``None``; on failure
        ``all_docs`` and ``all_sections`` are ``None`` and ``error`` is a
        message naming the first file (in upload order) that produced no chunks.
    """
    if not pdf_files:
        return None, None, None, "Please upload at least one PDF file."

    logger.info(f"Number of input files: {len(pdf_files)}")
    all_docs = []
    all_sections = {"Introduction": [], "Methodology": [], "Results": [], "Discussion": [], "References": []}

    with concurrent.futures.ThreadPoolExecutor() as executor:
        # executor.map yields results in submission order, so the merged chunk
        # list is deterministic (as_completed ordered it by finishing time).
        results = executor.map(process_single_pdf, pdf_files)
        for pdf_file, (docs, sections) in zip(pdf_files, results):
            if not docs:
                file_label = pdf_file.name if hasattr(pdf_file, 'name') else pdf_file
                return None, None, None, f"Error processing file: {file_label}"
            all_docs.extend(docs)
            for key in all_sections:
                all_sections[key].extend(sections[key])

    logger.info(f"Total number of processed documents: {len(all_docs)}")
    return None, all_docs, all_sections, None
def create_vector_db(docs):
    """Embed the given chunks and index them in a FAISS vector store.

    Returns:
        ``(vector_store, None)`` on success, or ``(None, message)`` when there
        is nothing to index or building the index raised.
    """
    if not docs:
        return None, "No content was processed."

    embedder = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=gemini_api_key)
    try:
        logger.info("Starting to build FAISS...")
        index = FAISS.from_documents(docs, embedding=embedder)
        logger.info(f"Vector database built with {len(docs)} documents.")
    except Exception as exc:
        logger.error(f"Error creating vector database: {str(exc)}")
        return None, f"Error in vector processing: {str(exc)}"
    return index, None
def extract_keywords(text):
    """Ask Gemini for up to five keywords describing ``text``.

    Only the first 2000 characters are sent. The reply is parsed leniently:
    keywords may be separated by commas, semicolons or newlines and may carry
    bullet/number markers or Markdown asterisks.

    Returns:
        A list of at most five keyword strings. A fixed generic list is
        returned when ``text`` is blank, the reply contains no keyword, or the
        API call fails (the failure is logged).
    """
    fallback = ["research", "results", "method", "analysis", "topic"]
    # Nothing to analyse: skip the API round-trip entirely.
    if not text or not text.strip():
        return fallback
    try:
        prompt = f"Extract 5 main keywords from the following text that represent the main topic:\n**Text:**\n{text[:2000]}\n**Keywords:**"
        model = genai.GenerativeModel('gemini-pro')
        response = model.generate_content(prompt)
        raw = response.text.split("**Keywords:**")[-1]

        keywords = []
        for part in re.split(r"[,;،\n]+", raw):
            # Drop a leading list marker ("- ", "* ", "1. ", "2) ") without
            # eating digits that belong to the keyword itself (e.g. "3D").
            cleaned = re.sub(r"^\s*(?:[-*•]+|\d+[.)])\s*", "", part).strip(" \t\r*.")
            if cleaned:
                keywords.append(cleaned)
        if not keywords:
            return fallback

        logger.info(f"Extracted keywords: {keywords}")
        time.sleep(1)  # space out consecutive Gemini calls
        return keywords[:5]
    except Exception as e:
        logger.error(f"Error extracting keywords: {str(e)}")
        return fallback
def translate_to_english(text):
    """Translate ``text`` to English with Gemini.

    Only the first 1000 characters are sent for translation.

    Returns:
        The translated string. The original ``text`` is returned unchanged when
        it is blank, when the model returns an empty translation, or when the
        API call fails (the failure is logged).
    """
    # Blank input has nothing to translate; avoid a wasted API call.
    if not text or not text.strip():
        return text
    try:
        prompt = f"Translate the following text to English:\n**Text:**\n{text[:1000]}\n**Translation:**"
        model = genai.GenerativeModel('gemini-pro')
        response = model.generate_content(prompt)
        translated_text = response.text.split("**Translation:**")[-1].strip()
        if not translated_text:
            # An empty reply would otherwise turn into an empty search query.
            return text
        logger.info(f"Translated text: {translated_text[:50]}...")
        time.sleep(1)  # space out consecutive Gemini calls
        return translated_text
    except Exception as e:
        logger.error(f"Error in translation: {str(e)}")
        return text
_PLAGIARISM_HTTP_HEADERS = {"User-Agent": "Mozilla/5.0"}
_PLAGIARISM_HTTP_TIMEOUT = 15  # seconds; a stalled request must not hang the UI callback


def _plagiarism_fetch_soup(url, params):
    """GET ``url`` with URL-encoded ``params`` and return the parsed HTML; raise on HTTP errors."""
    response = requests.get(
        url,
        params=params,
        headers=_PLAGIARISM_HTTP_HEADERS,
        timeout=_PLAGIARISM_HTTP_TIMEOUT,
    )
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')


def _plagiarism_split_byline(byline):
    """Return ``(author, year)``: the text before the first " - " and the first 19xx/20xx year."""
    author = byline.split(" - ", 1)[0].strip() or "Unknown Author"
    year_match = re.search(r"\b(?:19|20)\d{2}\b", byline)
    return author, (year_match.group(0) if year_match else "Unknown")


def _plagiarism_search_scholar(query):
    """Return up to five ``(title, link, author, year)`` tuples from Google Scholar."""
    soup = _plagiarism_fetch_soup("https://scholar.google.com/scholar", {"q": query})
    results = []
    for item in soup.find_all('h3', class_='gs_rt', limit=5):
        title = item.get_text().strip()
        anchor = item.find('a')
        link = (anchor.get('href') if anchor is not None else None) or "No link available"
        author_info = item.find_next('div', class_='gs_a')
        if author_info is not None:
            author, year = _plagiarism_split_byline(author_info.get_text().strip())
        else:
            author, year = "Unknown Author", "Unknown"
        results.append((title, link, author, year))
    return results


def _plagiarism_search_arxiv(query):
    """Return up to five ``(title, link, author, year)`` tuples from the arXiv search page."""
    soup = _plagiarism_fetch_soup(
        "https://arxiv.org/search/",
        {"query": query, "searchtype": "all", "source": "header", "start": 0, "max_results": 10},
    )
    results = []
    for item in soup.find_all('p', class_='title', limit=5):
        title = item.get_text().strip()
        # NOTE(review): selector kept from the original; confirm that arXiv
        # result links really carry the 'arxiv-url' class.
        anchor = item.find_previous('a', class_='arxiv-url')
        link = (anchor.get('href') if anchor is not None else None) or "No link available"
        author_info = item.find_next('p', class_='authors')
        year_info = item.find_next('p', class_='is-size-7')
        author = author_info.get_text().replace("Authors:", "").strip() if author_info is not None else "Unknown Author"
        year_match = re.search(r"\d{4}", year_info.get_text() if year_info is not None else "")
        year = year_match.group(0) if year_match else "Unknown"
        results.append((title, link, author, year))
    return results


def check_plagiarism(text, language):
    """Compare ``text`` against titles found on Google Scholar and arXiv.

    Keywords are extracted from the text, translated to English and used as
    the search query. Each returned title is compared with the first 1000
    characters of ``text`` using ``SequenceMatcher``; titles above 10%
    similarity are listed (at most three) together with the highest ratio.

    Args:
        text: The document text to check.
        language: ``"English"`` selects English messages; anything else Persian.

    Returns:
        A human-readable report string. Never raises: any failure (including
        every search source being unreachable) is logged and returned as an
        error message in the requested language.
    """
    english = language == "English"
    try:
        keywords = extract_keywords(text)
        query = translate_to_english(" ".join(keywords))

        sources = (
            ("Google Scholar", _plagiarism_search_scholar),
            ("arXiv", _plagiarism_search_arxiv),
        )
        all_results = []
        failed_sources = []
        for source_name, search in sources:
            # Each source is best-effort so one blocked site does not lose the other.
            try:
                found = search(query)
            except requests.RequestException as e:
                failed_sources.append(source_name)
                logger.warning(f"{source_name} search failed: {str(e)}")
                continue
            logger.info(f"{source_name} results: {found}")
            all_results.extend(found)

        if len(failed_sources) == len(sources):
            # Do not report "no plagiarism" when nothing was actually searched.
            raise RuntimeError("all literature searches failed (" + ", ".join(failed_sources) + ")")

        if not all_results:
            return "No significant similarity found.\n**Explanation:** Your text was compared with scientific resources in Google Scholar and arXiv, and no meaningful matches were found.\n**Status:** Plagiarism likelihood is very low." if english else "هیچ تشابه قابل توجهی یافت نشد.\n**توضیح:** متن شما با منابع علمی موجود در Google Scholar و arXiv مقایسه شد و هیچ تطابقی معناداری پیدا نشد.\n**وضعیت:** احتمال سرقت ادبی بسیار پایین است."

        max_similarity = 0
        matched_texts = []
        sample = text[:1000]
        # Purely local comparison: no per-item delay is needed here.
        for title, link, author, year in all_results:
            similarity = SequenceMatcher(None, sample, title).ratio()
            if similarity > 0.1:  # Minimum 10% similarity for display
                matched_texts.append(f"**Title:** {title}\n**Author:** {author}\n**Year:** {year}\n**Link:** {link}\n**Note:** This resource may have some similarity with your text." if english else f"**عنوان:** {title}\n**نویسنده:** {author}\n**سال:** {year}\n**لینک:** {link}\n**توضیح:** این منبع ممکن است بخشی از متن شما را مشابه داشته باشد.")
                max_similarity = max(max_similarity, similarity)

        if not matched_texts:
            return "No significant similarity found.\n**Explanation:** Your text was compared with scientific resources and no matches were found.\n**Status:** Plagiarism likelihood is very low." if english else "هیچ تشابه قابل توجهی یافت نشد.\n**توضیح:** متن شما با منابع علمی مقایسه شد و تطابقی پیدا نشد.\n**وضعیت:** احتمال سرقت ادبی بسیار پایین است."

        similarity_percent = max_similarity * 100
        if similarity_percent > 20:
            status = "Plagiarism is likely. Please review similar resources and add appropriate citations." if english else "احتمال سرقت ادبی وجود دارد. لطفاً منابع مشابه را بررسی کنید و ارجاع مناسب اضافه کنید."
        elif similarity_percent > 10:
            status = "Low similarity. Possibly coincidental, but reviewing resources is recommended." if english else "تشابه کم. احتمالاً تصادفی است، اما مرور منابع توصیه میشود."
        else:
            status = "Very low similarity. Plagiarism likelihood is negligible." if english else "تشابه بسیار کم. احتمال سرقت ادبی ناچیز است."

        output = (f"**Similarity Percentage:** {similarity_percent:.2f}%\n"
                  f"**Status:** {status}\n"
                  f"**Similar Resources Found:**\n" + "\n--------------------\n".join(matched_texts[:3]))
        return output
    except Exception as e:
        logger.error(f"Error in plagiarism check: {str(e)}")
        return f"Error in plagiarism check: {str(e)}\nPlease try again or contact support." if english else f"خطا در بررسی سرقت ادبی: {str(e)}\nلطفاً دوباره امتحان کنید یا با پشتیبانی تماس بگیرید."
def suggest_resources(text, language):
    """Suggest up to five arXiv papers related to ``text``.

    Keywords are extracted from the text, translated to English and sent to
    the arXiv search page.

    Args:
        text: The document text to find related papers for.
        language: ``"English"`` selects English messages; anything else Persian.

    Returns:
        A list of ``"<title> (Link: <url>)"`` strings, or a single-item list
        carrying a "nothing found" / "search error" message. Never raises.
    """
    english = language == "English"
    try:
        keywords = extract_keywords(text)
        query = translate_to_english(" ".join(keywords))

        # Let requests URL-encode the query (it may contain spaces, '&', '#')
        # and bound the wait so a stalled request cannot hang the UI.
        response_arxiv = requests.get(
            "https://arxiv.org/search/",
            params={"query": query, "searchtype": "all", "source": "header", "start": 0, "max_results": 10},
            headers={"User-Agent": "Mozilla/5.0"},
            timeout=15,
        )
        response_arxiv.raise_for_status()
        soup_arxiv = BeautifulSoup(response_arxiv.text, 'html.parser')

        papers_arxiv = []
        for paper in soup_arxiv.find_all('p', class_='title', limit=5):
            title = paper.get_text().strip()
            # NOTE(review): selector kept from the original; confirm that arXiv
            # result links really carry the 'arxiv-url' class.
            anchor = paper.find_previous('a', class_='arxiv-url')
            link = (anchor.get('href') if anchor is not None else None) or "No link available"
            papers_arxiv.append(f"{title} (Link: {link})")
        time.sleep(1)

        if papers_arxiv:
            return papers_arxiv
        return ["No resources found."] if english else ["منبعی یافت نشد."]
    except Exception as e:
        logger.error(f"Error in suggesting resources: {str(e)}")
        return ["Error in resource search"] if english else ["خطا در جستجوی منابع"]
def evaluate_quality(docs, sections, language):
    """Score a document's academic quality with simple text heuristics.

    Six criteria add to the score (references, coherence/scientific terms,
    tables/figures, depth of Results+Discussion, IMRAD structure, length);
    the total is clamped to 0-100. A final arXiv lookup only adds a suggestion.

    Args:
        docs: Chunk documents whose ``page_content`` is evaluated.
        sections: Mapping of section name to that section's chunk documents.
        language: ``"Farsi"`` localizes the messages to Persian and joins them
            into ``"; "``-separated strings; any other value keeps English lists.

    Returns:
        ``(score, explanation, suggestions, auto_fix)``. ``explanation`` and
        ``suggestions`` are lists of English strings, or single joined Persian
        strings when ``language == "Farsi"``. ``auto_fix`` is English text
        (possibly empty) with a sample citation and/or a rewritten paragraph.
    """
    docs = docs or []
    sections = sections or {}
    farsi = language == "Farsi"

    def localized(english_text, farsi_text):
        # Choose the message at creation time instead of string-matching it later.
        return farsi_text if farsi else english_text

    text = " ".join([doc.page_content for doc in docs])
    score = 0
    explanation = []
    suggestions = []
    auto_fix = ""

    # Criterion 1: References (Quality and Quantity)
    ref_count = len(re.findall(r"\[\d+\]|[A-Za-z]+\s+\d{4}", text))
    if ref_count > 15:
        score += 30
        explanation.append(localized(
            "Very strong and credible references (more than 15 citations from reputable journals).",
            "منابع بسیار قوی و قابل استناد (بیش از 15 ارجاع از مجلات معتبر)"))
    elif ref_count > 10:
        score += 25
        explanation.append(localized(
            "Sufficient and credible references (10-15 citations).",
            "منابع کافی و قابل استناد (10-15 ارجاع)"))
    elif ref_count > 0:
        score += 15
        explanation.append(localized(
            "Existing but limited references (fewer than 10 citations).",
            "منابع موجود اما محدود (کمتر از 10 ارجاع)"))
        suggestions.append(localized(
            "Add at least 5 sources from reputable journals (like IEEE, Springer, or Elsevier) with precise author and year citations.",
            "حداقل 5 منبع از مجلات معتبر (مثل IEEE، Springer، یا Elsevier) با ذکر دقیق نویسنده و سال اضافه کنید"))
    else:
        explanation.append(localized(
            "No sufficient references found.",
            "منابع کافی یافت نشد"))
        suggestions.append(localized(
            "Complete the references section with at least 10 citations from peer-reviewed articles.",
            "بخش منابع را با حداقل 10 ارجاع از مقالات Peer-Reviewed تکمیل کنید"))
        auto_fix += "\n**Auto-fix - Sample Citation:**\n[1] Smith, J. (2020). 'Advanced Research Methods', Journal of Science, 15(3), 123-145."

    # Criterion 2: Coherence, Writing, and Scientific Weight
    words = text.split()
    word_freq = Counter(words).most_common(10)
    # Kept under its own name so the later arXiv keyword lookup cannot overwrite it.
    frequent_words = [word for word, _ in word_freq[:3]] if word_freq else ["research", "results", "method"]
    science_vocab = {"analysis", "data", "method", "result", "hypothesis", "theory"}
    scientific_terms = sum(1 for word in words if word.lower() in science_vocab)
    if word_freq and word_freq[0][1] > len(words) * 0.02 and scientific_terms > len(words) * 0.05:
        score += 25
        explanation.append(localized(
            "Excellent textual coherence and high scientific weight (focus on topic and use of scientific terms).",
            "انسجام متنی عالی و بار علمی بالا (تمرکز روی موضوع و استفاده از اصطلاحات علمی)"))
    else:
        explanation.append(localized(
            "Poor textual coherence or low scientific weight (topic dispersion or lack of scientific terms).",
            "انسجام متنی ضعیف یا بار علمی پایین (پراکندگی موضوعی یا کمبود اصطلاحات علمی)"))
        suggestions.append(localized(
            f"Use keywords like {', '.join(frequent_words)} and scientific terms (like 'statistical analysis' or 'hypothesis') more frequently and make sentences smoother.",
            f"از کلمات کلیدی مثل {', '.join(frequent_words)} و اصطلاحات علمی (مثل 'تحلیل آماری' یا 'فرضیه') بیشتر استفاده کنید و جملات را روانتر کنید"))
        try:
            prompt = f"Rewrite the following paragraph to be more scientific, smoother, and with higher scientific weight:\n**Text:**\n{text[:500]}\n**Rewritten:**"
            model = genai.GenerativeModel('gemini-pro')
            response = model.generate_content(prompt)
            auto_fix += f"\n**Auto-fix - Rewritten Paragraph:**\n{response.text.split('**Rewritten:**')[-1].strip()}"
            time.sleep(1)
        except Exception as e:
            logger.error(f"Error in rewriting: {str(e)}")
            auto_fix += "\n**Auto-fix - Rewritten:**\nError in rewriting, please manually revise the text."

    # Criterion 3: Tables/Figures
    if re.search(r"Table|Figure|جدول|شکل", text, re.I):
        score += 20
        explanation.append(localized(
            "Effective use of tables or figures to support findings.",
            "استفاده مؤثر از جداول یا شکلها برای پشتیبانی یافتهها"))
    else:
        explanation.append(localized(
            "No use of tables or figures.",
            "عدم استفاده از جداول یا شکلها"))
        suggestions.append(localized(
            "Add a table for data and a figure (like a bar chart or line graph) for trends to make findings more comprehensible.",
            "یک جدول برای دادهها و یک شکل (مثل نمودار میلهای یا خطی) برای روندها اضافه کنید تا یافتهها قابلفهمتر شوند"))

    # Criterion 4: Depth of Analysis and Scientific Weight
    analysis_text = " ".join([doc.page_content for doc in sections.get("Results", []) + sections.get("Discussion", [])])
    analysis_words = len(analysis_text.split())
    stats_found = bool(re.search(r"Statistic|Regression|ANOVA|T-test|Correlation|آمار", analysis_text, re.I))
    if analysis_words > 1500 and stats_found:
        score += 25
        explanation.append(localized(
            "Very high depth of analysis (long and statistical with strong scientific weight).",
            "عمق تحلیل بسیار بالا (تحلیل طولانی و آماری با بار علمی قوی)"))
    elif analysis_words > 1000:
        score += 15
        explanation.append(localized(
            "Acceptable depth of analysis (long but lacking sufficient statistical analysis).",
            "عمق تحلیل قابل قبول (طولانی اما بدون تحلیل آماری کافی)"))
        suggestions.append(localized(
            "Add advanced statistical analysis (like regression, ANOVA, or T-test) to strengthen findings.",
            "تحلیل آماری پیشرفته (مثل رگرسیون، ANOVA، یا T-test) برای تقویت یافتهها اضافه کنید"))
    else:
        explanation.append(localized(
            "Poor depth of analysis (short and without statistical analysis).",
            "عمق تحلیل ضعیف (کوتاه و بدون تحلیل آماری)"))
        suggestions.append(localized(
            "Expand the Results/Discussion section to at least 1500 words with comprehensive statistical analysis.",
            "بخش نتایج/بحث را با حداقل 1500 کلمه و تحلیل آماری جامع گسترش دهید"))

    # Criterion 5: IMRAD Structure (Advanced Academic Quality)
    imrad_names = ("Introduction", "Methodology", "Results", "Discussion")
    present_count = sum(1 for name in imrad_names if sections.get(name))
    missing_count = len(imrad_names) - present_count
    imrad_score = present_count * 5  # 5 points per section present
    score += imrad_score
    if imrad_score == 20:
        explanation.append(localized(
            "Complete IMRAD structure present (Introduction, Methodology, Results, Discussion).",
            "ساختار کامل IMRAD موجود است (مقدمه، روششناسی، نتایج، بحث)"))
    elif imrad_score > 0:
        explanation.append(localized(
            f"Partial IMRAD structure present (missing {missing_count} sections).",
            f"ساختار ناقص IMRAD موجود است (بخشهای گمشده {missing_count} بخش)"))
        suggestions.append(localized(
            "Ensure all sections of IMRAD (Introduction, Methodology, Results, Discussion) are included for a complete academic structure.",
            "تمام بخشهای IMRAD (مقدمه، روششناسی، نتایج، بحث) را برای ساختار آکادمیک کامل شامل کنید"))
    else:
        explanation.append(localized(
            "No IMRAD structure detected.",
            "هیچ ساختاری از IMRAD تشخیص داده نشد"))
        # No Persian wording exists for this suggestion; it stays in English.
        suggestions.append("Structure your document following the IMRAD format (Introduction, Methodology, Results, Discussion) for better academic quality.")

    # Criterion 6: Word Count (Basic Length Check)
    word_count = len(words)
    if word_count > 5000:
        score += 10
        explanation.append(localized(
            "Document length is excellent (over 5000 words).",
            "طول سند عالی است (بیش از 5000 کلمه)"))
    elif word_count > 3000:
        score += 7
        explanation.append(localized(
            "Document length is good (3000-5000 words).",
            "طول سند خوب است (3000-5000 کلمه)"))
    else:
        explanation.append(localized(
            "Document length is short (less than 3000 words).",
            "طول سند کوتاه است (کمتر از 3000 کلمه)"))
        suggestions.append(localized(
            "Expand the document to at least 3000 words for better academic depth.",
            "سند را به حداقل 3000 کلمه گسترش دهید تا عمق آکادمیک بیشتری داشته باشد"))

    # Measure scientific weight by global comparison
    try:
        search_keywords = extract_keywords(text)
        query = translate_to_english(" ".join(search_keywords))
        # params= URL-encodes the query; the timeout bounds a stalled request.
        response_arxiv = requests.get(
            "https://arxiv.org/search/",
            params={"query": query, "searchtype": "all", "source": "header"},
            headers={"User-Agent": "Mozilla/5.0"},
            timeout=15,
        )
        response_arxiv.raise_for_status()
        soup_arxiv = BeautifulSoup(response_arxiv.text, 'html.parser')
        arxiv_titles = [paper.get_text().strip() for paper in soup_arxiv.find_all('p', class_='title')[:3]]
        if arxiv_titles:
            # The Persian text now carries the real title instead of a literal '...'.
            suggestions.append(localized(
                f"To increase scientific weight, refer to similar arXiv papers like '{arxiv_titles[0]}' and compare your findings with them.",
                f"برای افزایش بار علمی، به مقالات مشابه در arXiv مثل '{arxiv_titles[0]}' رجوع کنید و یافتههای خود را با آنها مقایسه کنید"))
        time.sleep(1)
    except Exception as e:
        logger.error(f"Error in scientific weight assessment: {str(e)}")
        suggestions.append(localized(
            "Global comparison with scientific resources failed due to an error.",
            "مقایسه با منابع علمی جهانی به دلیل خطا انجام نشد"))

    # The criteria can add up to more than 100, so clamp.
    score = max(min(score, 100), 0)

    if farsi:
        # Persian output is returned as joined strings (existing contract).
        explanation = "; ".join(explanation)
        suggestions = "; ".join(suggestions)
    return score, explanation, suggestions, auto_fix
def generate_visualization(docs, language):
    """Render the document's keywords as a bar chart embedded in an HTML ``<img>`` tag.

    Args:
        docs: Chunk documents; their ``page_content`` is joined and passed to
            ``extract_keywords``.
        language: ``"English"`` selects English labels; anything else Persian.

    Returns:
        ``(message, html)`` where ``html`` is an ``<img>`` tag with the chart
        as a base64 PNG data URI, or ``(message, None)`` when ``docs`` is empty.
    """
    if not docs:
        return "No data available for visualization.", None

    english = language == "English"
    # Extract keywords from all documents.
    all_text = " ".join([doc.page_content for doc in docs])
    keywords = extract_keywords(all_text)

    # Draw on an explicit figure rather than pyplot's global "current figure",
    # and always close it: pyplot keeps every figure alive until closed, which
    # leaks memory in a long-running server.
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        ax.bar(keywords, [1] * len(keywords))  # fixed height: shows the keywords, not real counts
        ax.set_title("Top Keywords from Document" if english else "کلمات کلیدی اصلی از سند")
        ax.set_xlabel("Keywords" if english else "کلمات کلیدی")
        ax.set_ylabel("Frequency" if english else "تعداد")

        # Save the chart as an in-memory PNG.
        with BytesIO() as buffer:
            fig.savefig(buffer, format='png', bbox_inches='tight')
            image_png = buffer.getvalue()
    finally:
        plt.close(fig)

    # Encode as Base64 so the image can be inlined in HTML.
    image_base64 = base64.b64encode(image_png).decode('utf-8')
    message = "Here is a visualization of the top keywords from your document." if english else "این یک نمایش بصری از کلمات کلیدی اصلی سند شماست."
    return message, f'<img src="data:image/png;base64,{image_base64}" style="max-width: 100%; height: auto; display: block;">'
# Shared chat model instance; create_conversation_chain passes it to every chain it builds.
# NOTE(review): the model name "gemini-pro" is hard-coded here and in the direct
# genai.GenerativeModel calls elsewhere in this file — confirm it is still served.
llm_gemini = ChatGoogleGenerativeAI(model="gemini-pro", google_api_key=gemini_api_key, convert_system_message_to_human=True, temperature=0.5)
def academic_chatbot(pdf_file, mode, query, language, detail_level, section_dropdown, visualize=False):
    """Gradio callback: run the selected processing mode and return the result text.

    Args:
        pdf_file: One uploaded file or a list of them (ignored for "Standard Response").
        mode: One of the processing modes offered by the UI.
        query: The user's question (used by the RAG and standard modes).
        language: ``"English"`` or ``"Farsi"``; selects message language and is
            passed to the prompts.
        detail_level: Passed through to the analysis prompt.
        section_dropdown: ``"Entire Document"`` or a section name to restrict to.
        visualize: When true, append a keyword chart for the modes that support it.

    Returns:
        The result (or an error message) as a string. Never raises for
        processing errors; they are logged and reported in the text.
    """
    start_time = time.time()
    logger.info(f"Starting processing - Mode: {mode}, Question: {query}, Language: {language}, Detail: {detail_level}, Section: {section_dropdown}, Visualize: {visualize}")
    english = language == "English"

    if mode == "Standard Response":
        try:
            # Chain construction is inside the try so setup errors are reported too.
            chain = create_conversation_chain(None, None, mode, language, detail_level)
            result = chain.invoke({"question": query})["text"]
            return result + f"\n\n⏱ Processing time: {time.time() - start_time:.2f} seconds"
        except Exception as e:
            logger.error(f"Error in standard processing: {str(e)}")
            return f"Error: {str(e)}"

    if not pdf_file:
        return "Please upload at least one PDF file."

    pdf_files = pdf_file if isinstance(pdf_file, list) else [pdf_file]
    _, docs, sections, error = upload_and_process_pdf(pdf_files)
    if error:
        return error

    if section_dropdown == "Entire Document":
        target_docs = docs
    else:
        # Every section key always exists, so a .get() default never applies;
        # fall back explicitly when the chosen section has no chunks.
        target_docs = sections.get(section_dropdown) or docs
        if target_docs is docs:
            logger.warning(f"No chunks found for section '{section_dropdown}'; using the entire document.")
    context = " ".join([doc.page_content for doc in target_docs])

    try:
        if mode == "Plagiarism Check":
            # Uses web search only: no vector store and no LLM chain required.
            result = check_plagiarism(context, language)
        elif mode == "Quality Evaluation":
            score, explanation, suggestions, auto_fix = evaluate_quality(target_docs, sections, language)
            chain = create_conversation_chain(None, target_docs, mode, language, detail_level, section_dropdown)
            time.sleep(2)
            result = chain.invoke({"context": context[:5000], "score": score, "explanation": explanation, "suggestions": suggestions})["text"] + auto_fix
        elif mode == "Auto Summary":
            chain = create_conversation_chain(None, target_docs, mode, language, detail_level, section_dropdown)
            time.sleep(2)
            result = chain.invoke({"context": context[:5000]})["text"]
        else:
            vector_store = None
            if mode == "Academic Analysis (RAG)":
                # Only retrieval needs embeddings; skip that API cost for other modes.
                vector_store, vectordb_error = create_vector_db(target_docs)
                if vectordb_error:
                    return vectordb_error
            chain = create_conversation_chain(vector_store, target_docs, mode, language, detail_level, section_dropdown)
            result = chain.invoke({"question": query, "chat_history": []})["answer"]

        if mode not in ["Plagiarism Check", "Quality Evaluation"]:
            resources = suggest_resources(context, language)
            heading = "\n\n**Suggested Resources:**\n" if english else "\n\n**منابع پیشنهادی:**\n"
            result += heading + "\n".join(resources)

        # If visualization is enabled, append the chart.
        if visualize and mode in ["Quality Evaluation", "Auto Summary", "Academic Analysis (RAG)"]:
            viz_text, viz_image = generate_visualization(target_docs, language)
            result += f"\n\n{viz_text}"
            if viz_image:  # None when there was nothing to plot
                result += f"\n{viz_image}"

        return result + f"\n\n⏱ Processing time: {time.time() - start_time:.2f} seconds"
    except Exception as e:
        logger.error(f"Error in processing: {str(e)}")
        if "429" in str(e):
            return "Error: Rate limit exceeded for Gemini API. Please wait a few minutes and try again." if english else "خطا: محدودیت درخواست به API Gemini. لطفاً چند دقیقه صبر کنید و دوباره امتحان کنید."
        return f"Error: {str(e)}" if english else f"خطا: {str(e)}"
# Prompt for retrieval-augmented analysis of a document section.
academic_analysis_prompt = PromptTemplate(
    template="""You are a professional academic analyst. Provide a deep and structured analysis of {section}:
1. Based solely on the provided text.
2. Including a review of the topic, methods, findings, and critique (if applicable).
3. In {language} with {detail_level} detail.
**Related Text:**
{context}
**User Question:** {question}
**Academic Analysis:**""",
    input_variables=["section", "context", "question", "language", "detail_level"]
)

# Prompt for a structured 200-300 word summary.
summary_prompt = PromptTemplate(
    template="""You are an expert in academic writing. Produce a structured scientific summary (200-300 words) of the following text in {language} that includes:
1. Research objective
2. Methodology
3. Main findings
4. Conclusion
**Text:**
{context}
**Summary:**""",
    input_variables=["context", "language"]
)

# Prompt for plain question answering without a document.
general_qa_prompt = PromptTemplate(
    template="""You are an intelligent assistant. Answer the user's question in {language}:
**User Question:** {question}
Answer:""",
    input_variables=["question", "language"]
)

# Prompt for reporting a similarity result.
# {language} is declared because create_conversation_chain always calls
# .partial(language=...) on this template.
plagiarism_prompt = PromptTemplate(
    template="""Report the percentage of similarity of the following text with English resources. Respond in {language}:
**Text:**
{context}
**Result:** {similarity}""",
    input_variables=["context", "similarity", "language"]
)

# Prompt for the quality report. {language} is declared for the same reason
# and so the evaluation is written in the language the user selected.
quality_prompt = PromptTemplate(
    template="""You are a professional academic evaluator. Evaluate the scientific quality of the following text and respond in {language}:
**Text:**
{context}
**Score:** {score}/100
**Explanations:** {explanation}
**Improvement Suggestions:** {suggestions}""",
    input_variables=["context", "score", "explanation", "suggestions", "language"]
)
def create_conversation_chain(vector_store, docs, mode, language, detail_level, section=None):
    """Build the LangChain chain that serves the given processing mode.

    "Academic Analysis (RAG)" yields a ConversationalRetrievalChain over
    ``vector_store`` (top 3 chunks). Every other mode yields an LLMChain whose
    prompt is chosen by mode, defaulting to the general Q&A prompt. ``docs``
    is accepted but not used here.
    """
    if mode == "Academic Analysis (RAG)":
        top_chunks = vector_store.as_retriever(search_kwargs={"k": 3})
        rag_prompt = academic_analysis_prompt.partial(
            language=language,
            detail_level=detail_level,
            section=section or "Entire Document",
        )
        return ConversationalRetrievalChain.from_llm(
            llm=llm_gemini,
            retriever=top_chunks,
            return_source_documents=True,
            combine_docs_chain_kwargs={"prompt": rag_prompt},
            verbose=True
        )

    # Mode -> base prompt; anything unlisted falls back to general Q&A.
    prompt_by_mode = {
        "Auto Summary": summary_prompt,
        "Plagiarism Check": plagiarism_prompt,
        "Quality Evaluation": quality_prompt,
    }
    base_prompt = prompt_by_mode.get(mode, general_qa_prompt)
    return LLMChain(llm=llm_gemini, prompt=base_prompt.partial(language=language))
# Script entry point: build the Gradio interface and start the server.
if __name__ == "__main__":
    with gr.Blocks(title="Professional Thesis Analyzer with Gemini") as iface:
        with gr.Row():
            # Left column: inputs and controls.
            with gr.Column():
                gr.Markdown("# Professional Thesis Analyzer with Gemini")
                gr.Markdown("Upload your PDF file and use the analysis, summary, plagiarism check, or quality evaluation features.")
                pdf_input = gr.File(file_types=['.pdf'], label="Upload PDF File", file_count="multiple")
                mode = gr.Radio(
                    ["Academic Analysis (RAG)", "Auto Summary", "Plagiarism Check", "Quality Evaluation", "Standard Response"],
                    label="Processing Mode",
                    value="Academic Analysis (RAG)"
                )
                query = gr.Textbox(lines=3, placeholder="Enter your question or request here...", label="Question or Request")
                section = gr.Dropdown(["Entire Document", "Introduction", "Methodology", "Results", "Discussion", "References"], label="Target Section", value="Entire Document")
                language_dropdown = gr.Dropdown(["English", "Farsi"], label="Response Language", value="English", interactive=True)
                detail = gr.Dropdown(["Brief", "Detailed"], label="Detail Level", value="Detailed")
                visualize = gr.Checkbox(label="Generate Visualization", value=False)
                submit = gr.Button("Submit")
            # Right column: result display.
            # NOTE(review): academic_chatbot returns text containing Markdown
            # markers and, with visualization on, an <img> tag; a Textbox
            # presumably shows these as raw text — confirm whether a
            # Markdown/HTML output component is intended.
            with gr.Column():
                output = gr.Textbox(label="Processing Result", lines=15, placeholder="Results will be displayed here...")
        # The inputs order must match academic_chatbot's positional parameters:
        # (pdf_file, mode, query, language, detail_level, section_dropdown, visualize).
        submit.click(
            fn=academic_chatbot,
            inputs=[pdf_input, mode, query, language_dropdown, detail, section, visualize],
            outputs=output
        )
    iface.launch()