SalwaM's picture
Update app.py
469e238 verified
import gradio as gr
import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer
from langchain_text_splitters import RecursiveCharacterTextSplitter
from groq import Groq
import pdfplumber
import os
# Initialise the shared components once, at import time.

# The Groq API key is read from the `api_key_coder` environment variable;
# it is None when the variable is not set.
api_key_coder = os.getenv("api_key_coder")

# Sentence-embedding model used for both the document chunks and the question.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# NOTE(review): depending on the installed chromadb version, `persist_directory`
# may have no effect unless persistence is enabled explicitly — confirm.
client = chromadb.Client(
    Settings(persist_directory="rag_db", anonymized_telemetry=False)
)

# Module-wide collection holding the chunks of the most recently processed PDF;
# the metadata requests cosine as the HNSW space.
collection = client.get_or_create_collection(
    name="pdf_collection", metadata={"hnsw:space": "cosine"}
)

groq_client = Groq(api_key=api_key_coder)
def extract_text_from_pdf(pdf_file):
    """Extract the text of every page of a PDF file.

    Args:
        pdf_file: Either a path to the PDF, or an object that exposes the
            path through a ``name`` attribute (e.g. an uploaded-file object).

    Returns:
        The text of all pages that yielded any text, each page followed by a
        newline. On any failure the problem is printed and a message starting
        with ``"Error:"`` is returned instead of raising.
    """
    try:
        # Accept both plain paths and file-like wrappers carrying `.name`.
        source = getattr(pdf_file, "name", pdf_file)
        with pdfplumber.open(source) as document:
            page_texts = (page.extract_text() for page in document.pages)
            # Pages without extractable text (None / empty) are skipped.
            extracted = "".join(
                content + "\n" for content in page_texts if content
            )
    except Exception as e:
        print(f"Error extracting text from PDF: {e}")
        return f"Error: Could not extract text from PDF. {e}"
    else:
        return extracted
def clear_collection():
    """Drop and recreate ``pdf_collection`` before new documents are added.

    Rebinds the module-level ``collection`` to a freshly created, empty
    collection configured with cosine as the HNSW space.

    Deleting the old collection is best effort: a failure (for example
    because the collection does not exist) is reported and otherwise ignored.
    Errors raised while creating the new collection propagate to the caller.
    """
    global collection
    try:
        client.delete_collection("pdf_collection")
    except Exception as exc:
        # Previously a bare `except: pass`, which also swallowed
        # KeyboardInterrupt/SystemExit and hid every failure silently.
        print(f"Could not delete collection 'pdf_collection': {exc}")
    collection = client.create_collection(
        name="pdf_collection",
        metadata={"hnsw:space": "cosine"},
    )
def _build_prompt(context, question):
    """Return the prompt that asks the model to answer from *context* only."""
    return f"""You are a research assistant. Answer the question based ONLY on the provided context.
Context from the document:
{context}
Question: {question}
Instructions:
1. Answer based ONLY on the information in the context above.
2. If the context doesn't contain relevant information, say "The document doesn't contain information about this."
3. Be clear and concise.
4. Provide page references if available.
5. Use bullet points for lists when appropriate.
"""


def answer_from_pdf(pdf_file, question):
    """Process a PDF and answer a question about its content.

    The PDF text is extracted, split into overlapping chunks, embedded and
    stored in the shared collection; the chunks closest to the question are
    then passed as context to the Groq chat model.

    Args:
        pdf_file: The uploaded PDF (path or object accepted by
            ``extract_text_from_pdf``), or None when nothing was uploaded.
        question: The user's question; None, empty or whitespace-only values
            are rejected.

    Returns:
        The model's answer, or a user-facing warning/error message. This
        function does not raise: any exception during processing is turned
        into an ``"❌ An error occurred: ..."`` message.

    NOTE(review): the collection is module-global and is reset on every call,
    so overlapping requests would presumably interfere with each other —
    confirm how the app is served before relying on concurrent use.
    """
    if pdf_file is None:
        return "⚠️ Please upload a PDF file first."
    if not question or not question.strip():
        return "⚠️ Please enter a question."
    try:
        # Extract the text first, so the index is only reset once there is
        # something to put in it.
        text = extract_text_from_pdf(pdf_file)
        # extract_text_from_pdf reports failures as an "Error:"-prefixed string.
        if text.startswith("Error:"):
            return text
        if not text.strip():
            return "⚠️ Could not extract any text from the PDF. The file might be scanned or encrypted."

        # Split the text into overlapping chunks.
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=800,
            chunk_overlap=150,
        )
        chunks = splitter.split_text(text)

        # Replace whatever the previous request indexed with this document.
        clear_collection()
        embeddings = embedding_model.encode(chunks).tolist()
        collection.add(
            documents=chunks,
            embeddings=embeddings,
            ids=[f"chunk_{i}" for i in range(len(chunks))],
        )

        # Retrieve the most relevant chunks. Never ask for more results than
        # there are chunks: short documents may yield fewer than three.
        query_embedding = embedding_model.encode(question).tolist()
        docs = collection.query(
            query_embeddings=[query_embedding],
            n_results=min(3, len(chunks)),
        )

        # The result holds one list of documents per query; check the inner
        # list so an empty hit list falls back to the placeholder text.
        documents = docs["documents"]
        top_chunks = documents[0] if documents else []
        context = "\n".join(top_chunks) if top_chunks else "No relevant context found."

        # Generate the answer with Groq.
        response = groq_client.chat.completions.create(
            model="meta-llama/llama-4-scout-17b-16e-instruct",  # change the model here if desired
            messages=[{"role": "user", "content": _build_prompt(context, question)}],
            temperature=0.1,
            max_tokens=500,
        )
        return response.choices[0].message.content
    except Exception as e:
        # Top-level UI boundary: surface the failure as text instead of crashing.
        return f"❌ An error occurred: {str(e)}"
# Build the Gradio interface.
# Example rows shown in the UI: each row is [file, question]; the file slot is
# left as None and only the question is provided.
examples = [
    [None, sample_question]
    for sample_question in (
        "What is the main idea of this document?",
        "Summarize the content briefly.",
        "What methodology is used in this paper?",
        "What are the key results presented by the author?",
        "Explain any important equations or algorithms mentioned.",
        "What are the strengths and weaknesses of this work?",
        "Does the paper discuss related or previous work?",
        "What practical applications are proposed?",
    )
]
# English version of the interface.
# Components are created in the order file input, question box, answer box.
_en_pdf_input = gr.File(
    label="📄 Upload PDF",
    file_types=[".pdf"],
    type="filepath",  # ensures the file path is what gets passed on
)
_en_question_input = gr.Textbox(
    label="❓ Question",
    lines=2,
    placeholder="Type your question about the PDF content here...",
)
_en_answer_output = gr.Textbox(
    label="✅ Answer",
    lines=10,
)
interface_en = gr.Interface(
    fn=answer_from_pdf,
    inputs=[_en_pdf_input, _en_question_input],
    outputs=_en_answer_output,
    title="📚 PDF Research Assistant",
    description="Upload a PDF file and ask any question related to its content. The system will extract text and provide answers based on the document.",
    examples=examples,
    theme=gr.themes.Soft(),
)
# Arabic version of the interface (same handler and examples, Arabic labels).
# Components are created in the order file input, question box, answer box.
_ar_pdf_input = gr.File(
    label="📄 ارفع ملف PDF",
    file_types=[".pdf"],
    type="filepath",
)
_ar_question_input = gr.Textbox(
    label="❓ السؤال",
    lines=2,
    placeholder="اكتب سؤالك حول محتوى ملف PDF هنا...",
)
_ar_answer_output = gr.Textbox(
    label="✅ الإجابة",
    lines=10,
)
interface_ar = gr.Interface(
    fn=answer_from_pdf,
    inputs=[_ar_pdf_input, _ar_question_input],
    outputs=_ar_answer_output,
    title="📚 مساعد البحث في ملفات PDF",
    description="ارفع ملف PDF واسأل أي سؤال متعلق بمحتواه. سيقوم النظام باستخراج النص وتقديم إجابات بناءً على المستند.",
    examples=examples,
    theme=gr.themes.Soft(),
)
# Combine both interfaces into tabs, one tab per language.
# NOTE(review): each inner interface sets its own theme; whether that theme is
# applied once wrapped in the tabbed container should be confirmed.
_tabs = (
    ("English Version", interface_en),
    ("النسخة العربية", interface_ar),
)
demo = gr.TabbedInterface(
    [tab_interface for _, tab_interface in _tabs],
    [tab_label for tab_label, _ in _tabs],
)
# Run the application when the file is executed as a script.
if __name__ == "__main__":
    launch_options = {
        "share": True,
        "debug": False,
        "server_name": "0.0.0.0",
        "server_port": 7860,
    }
    demo.launch(**launch_options)