Spaces:

SalwaM
/

PDF_Research_Assistant

Sleeping

App Files Files Community

PDF_Research_Assistant / app.py

SalwaM

Update app.py

469e238 verified 3 months ago

raw

history blame contribute delete

6.79 kB

	import gradio as gr
	import chromadb
	from chromadb.config import Settings
	from sentence_transformers import SentenceTransformer
	from langchain_text_splitters import RecursiveCharacterTextSplitter
	from groq import Groq
	import pdfplumber
	import os


	# Initialise the shared, module-level components.

	# Groq API key, read from the `api_key_coder` environment variable
	# (None when the variable is not set).
	api_key_coder = os.getenv('api_key_coder')

	# Sentence-embedding model shared by document chunks and questions.
	embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

	# Chroma client plus the collection that holds the PDF chunks; the
	# collection is configured with "hnsw:space": "cosine".
	_chroma_settings = Settings(
	    persist_directory="rag_db",
	    anonymized_telemetry=False,
	)
	client = chromadb.Client(_chroma_settings)
	collection = client.get_or_create_collection(
	    name="pdf_collection",
	    metadata={"hnsw:space": "cosine"},
	)

	# Client used to request chat completions from Groq.
	groq_client = Groq(api_key=api_key_coder)

	def extract_text_from_pdf(pdf_file):
	    """Extract the text of every page of a PDF.

	    Args:
	        pdf_file: Either a path to the PDF or an object that exposes the
	            path through a ``name`` attribute.

	    Returns:
	        The text of each page that produced any text, with a newline
	        appended after every page. If anything raises while reading the
	        file, the error is printed and a string beginning with
	        ``"Error:"`` is returned instead.
	    """
	    try:
	        # Fall back to the argument itself when it carries no `.name`.
	        source_path = getattr(pdf_file, 'name', pdf_file)

	        page_chunks = []
	        with pdfplumber.open(source_path) as document:
	            for current_page in document.pages:
	                extracted = current_page.extract_text()
	                # Pages without extractable text contribute nothing.
	                if not extracted:
	                    continue
	                page_chunks.append(extracted + "\n")
	    except Exception as e:
	        print(f"Error extracting text from PDF: {e}")
	        return f"Error: Could not extract text from PDF. {str(e)}"

	    return "".join(page_chunks)

	def clear_collection():
	    """Drop the "pdf_collection" collection and create it again, empty.

	    Rebinds the module-level ``collection`` to the newly created
	    collection so that later calls operate on a store containing only
	    the next document.

	    Deleting is best-effort: a failure is printed and creation is still
	    attempted. Any exception raised by ``client.create_collection``
	    propagates to the caller.
	    """
	    global collection

	    try:
	        client.delete_collection("pdf_collection")
	    except Exception as exc:
	        # Catch Exception rather than using a bare `except`, so that
	        # KeyboardInterrupt / SystemExit are not swallowed, and report
	        # the failure instead of hiding it.
	        print(f"Could not delete existing collection: {exc}")

	    collection = client.create_collection(
	        name="pdf_collection",
	        metadata={"hnsw:space": "cosine"},
	    )

	def answer_from_pdf(pdf_file, question):
	    """Answer a question using only the text of an uploaded PDF.

	    The PDF text is extracted, split into overlapping chunks, embedded
	    and stored in the (freshly cleared) module-level collection. The
	    chunks closest to the question are then sent to the Groq chat model
	    as context.

	    Args:
	        pdf_file: The uploaded PDF (a path, or an object with a ``name``
	            attribute), or None when nothing was uploaded.
	        question: The user's question; may be None or blank.

	    Returns:
	        The model's answer, or a user-facing warning/error message.
	        Exceptions raised during processing are caught and returned as
	        an error message rather than propagated.
	    """
	    if pdf_file is None:
	        return "⚠️ Please upload a PDF file first."

	    if not question or not question.strip():
	        return "⚠️ Please enter a question."

	    try:
	        # Start from an empty collection so chunks from a previously
	        # processed document cannot be retrieved.
	        # NOTE(review): the collection is module-level state; presumably
	        # concurrent requests would overwrite each other's chunks — confirm.
	        clear_collection()

	        # Extract the text from the PDF.
	        text = extract_text_from_pdf(pdf_file)

	        # extract_text_from_pdf reports failures in-band as "Error: ..." text.
	        if text.startswith("Error:"):
	            return text

	        if not text.strip():
	            return "⚠️ Could not extract any text from the PDF. The file might be scanned or encrypted."

	        # Split the text into overlapping chunks.
	        splitter = RecursiveCharacterTextSplitter(
	            chunk_size=800,
	            chunk_overlap=150,
	        )
	        chunks = splitter.split_text(text)

	        # Nothing to index: report it the same way as an empty extraction
	        # instead of sending empty lists to the vector store.
	        if not chunks:
	            return "⚠️ Could not extract any text from the PDF. The file might be scanned or encrypted."

	        # Embed the chunks and store them.
	        embeddings = embedding_model.encode(chunks).tolist()
	        collection.add(
	            documents=chunks,
	            embeddings=embeddings,
	            ids=[f"chunk_{i}" for i in range(len(chunks))],
	        )

	        # Retrieve the chunks most relevant to the question. Never ask for
	        # more results than there are stored chunks (short PDFs can yield
	        # fewer than three).
	        query_embedding = embedding_model.encode(question).tolist()
	        docs = collection.query(
	            query_embeddings=[query_embedding],
	            n_results=min(3, len(chunks)),
	        )

	        # Assemble the context. Check the inner list as well: a result of
	        # [[]] is truthy but contains no documents.
	        retrieved = docs["documents"]
	        top_chunks = retrieved[0] if retrieved else []
	        context = "\n".join(top_chunks) if top_chunks else "No relevant context found."

	        # Build the prompt and request the answer from Groq.
	        prompt = f"""You are a research assistant. Answer the question based ONLY on the provided context.

	Context from the document:
	{context}

	Question: {question}

	Instructions:
	1. Answer based ONLY on the information in the context above.
	2. If the context doesn't contain relevant information, say "The document doesn't contain information about this."
	3. Be clear and concise.
	4. Provide page references if available.
	5. Use bullet points for lists when appropriate.
	"""

	        response = groq_client.chat.completions.create(
	            model="meta-llama/llama-4-scout-17b-16e-instruct",  # change the model here if desired
	            messages=[{"role": "user", "content": prompt}],
	            temperature=0.1,
	            max_tokens=500,
	        )

	        return response.choices[0].message.content

	    except Exception as e:
	        # UI boundary: surface the failure as text in the answer box.
	        return f"❌ An error occurred: {str(e)}"

	# Build the Gradio user interface.

	# Sample questions shown in the UI; the file slot of each example is None.
	_example_questions = (
	    "What is the main idea of this document?",
	    "Summarize the content briefly.",
	    "What methodology is used in this paper?",
	    "What are the key results presented by the author?",
	    "Explain any important equations or algorithms mentioned.",
	    "What are the strengths and weaknesses of this work?",
	    "Does the paper discuss related or previous work?",
	    "What practical applications are proposed?",
	)
	examples = [[None, sample_question] for sample_question in _example_questions]


	def _build_interface(file_label, question_label, question_placeholder,
	                     answer_label, title, description):
	    """Create one language-specific gr.Interface around answer_from_pdf."""
	    upload_box = gr.File(
	        label=file_label,
	        file_types=[".pdf"],
	        type="filepath",  # pass the uploaded file to the handler as a path
	    )
	    question_box = gr.Textbox(
	        label=question_label,
	        lines=2,
	        placeholder=question_placeholder,
	    )
	    answer_box = gr.Textbox(
	        label=answer_label,
	        lines=10,
	    )
	    return gr.Interface(
	        fn=answer_from_pdf,
	        inputs=[upload_box, question_box],
	        outputs=answer_box,
	        title=title,
	        description=description,
	        examples=examples,
	        theme=gr.themes.Soft(),
	    )


	# English version.
	interface_en = _build_interface(
	    file_label="📄 Upload PDF",
	    question_label="❓ Question",
	    question_placeholder="Type your question about the PDF content here...",
	    answer_label="✅ Answer",
	    title="📚 PDF Research Assistant",
	    description="Upload a PDF file and ask any question related to its content. The system will extract text and provide answers based on the document.",
	)

	# Arabic version.
	interface_ar = _build_interface(
	    file_label="📄 ارفع ملف PDF",
	    question_label="❓ السؤال",
	    question_placeholder="اكتب سؤالك حول محتوى ملف PDF هنا...",
	    answer_label="✅ الإجابة",
	    title="📚 مساعد البحث في ملفات PDF",
	    description="ارفع ملف PDF واسأل أي سؤال متعلق بمحتواه. سيقوم النظام باستخراج النص وتقديم إجابات بناءً على المستند.",
	)

	# Expose both interfaces as tabs of a single app.
	_tab_titles = ["English Version", "النسخة العربية"]
	demo = gr.TabbedInterface(
	    [interface_en, interface_ar],
	    _tab_titles,
	)

	# Launch the app when this file is executed as a script.
	if __name__ == "__main__":
	    launch_options = {
	        "share": True,
	        "debug": False,
	        "server_name": "0.0.0.0",
	        "server_port": 7860,
	    }
	    demo.launch(**launch_options)