# PDF Research Assistant — Gradio app for question answering over an uploaded PDF
# (ChromaDB vector store + SentenceTransformers embeddings + Groq LLM).
import gradio as gr
import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer
from langchain_text_splitters import RecursiveCharacterTextSplitter
from groq import Groq
import pdfplumber
import os

# --- Initialize components (module level, runs once at import) ---

# Groq API key read from the environment. If the variable is unset this is
# None and Groq calls will fail at request time, not here.
api_key_coder = os.environ.get('api_key_coder')

# Embedding model used for both document chunks and user queries.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# ChromaDB client persisting to the local "rag_db" directory.
client = chromadb.Client(Settings(
    persist_directory="rag_db",
    anonymized_telemetry=False
))

# Vector collection configured for cosine similarity.
collection = client.get_or_create_collection(
    name="pdf_collection",
    metadata={"hnsw:space": "cosine"}
)

groq_client = Groq(api_key=api_key_coder)
def extract_text_from_pdf(pdf_file):
    """Extract the text of every page of a PDF.

    Args:
        pdf_file: Either a filesystem path or a Gradio upload object
            exposing a ``.name`` attribute with the path.

    Returns:
        The concatenated page text (pages separated by newlines), or a
        string starting with ``"Error:"`` if extraction failed.
    """
    pages = []
    try:
        # Gradio may hand us a wrapper object whose .name holds the path.
        path = getattr(pdf_file, 'name', pdf_file)
        with pdfplumber.open(path) as pdf:
            for page in pdf.pages:
                content = page.extract_text()
                if content:
                    pages.append(content + "\n")
    except Exception as e:
        print(f"Error extracting text from PDF: {e}")
        return f"Error: Could not extract text from PDF. {str(e)}"
    return "".join(pages)
def clear_collection():
    """Reset the vector store by dropping and recreating the PDF collection.

    Rebinds the module-level ``collection`` so subsequent add/query calls
    operate on an empty collection.
    """
    global collection
    try:
        client.delete_collection("pdf_collection")
    except Exception:
        # Narrowed from a bare `except:` (which also swallowed
        # KeyboardInterrupt/SystemExit). The collection simply may not
        # exist yet on first run — that is fine.
        pass
    collection = client.create_collection(
        name="pdf_collection",
        metadata={"hnsw:space": "cosine"}
    )
def answer_from_pdf(pdf_file, question):
    """Answer a question about an uploaded PDF via retrieval-augmented generation.

    Pipeline: reset the vector store, extract the PDF text, chunk it, embed
    and index the chunks, retrieve the chunks most similar to the question,
    then ask the Groq LLM to answer using only that context.

    Args:
        pdf_file: Uploaded PDF (Gradio filepath or file-like object), or None.
        question: The user's question about the document.

    Returns:
        The model's answer, or a user-facing warning/error message string.
    """
    if pdf_file is None:
        return "⚠️ Please upload a PDF file first."
    if not question or question.strip() == "":
        return "⚠️ Please enter a question."
    try:
        # Drop any previously indexed document.
        clear_collection()

        text = extract_text_from_pdf(pdf_file)
        if text.startswith("Error:"):
            return text
        if len(text.strip()) == 0:
            return "⚠️ Could not extract any text from the PDF. The file might be scanned or encrypted."

        # Overlapping chunks so sentence context survives the split points.
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=800,
            chunk_overlap=150
        )
        chunks = splitter.split_text(text)

        # Embed and index every chunk.
        embeddings = embedding_model.encode(chunks).tolist()
        collection.add(
            documents=chunks,
            embeddings=embeddings,
            ids=[f"chunk_{i}" for i in range(len(chunks))]
        )

        # Retrieve the most relevant chunks. Never request more results
        # than were indexed (Chroma warns on short documents otherwise).
        query_embedding = embedding_model.encode(question).tolist()
        docs = collection.query(
            query_embeddings=[query_embedding],
            n_results=min(3, len(chunks))
        )

        # BUGFIX: Chroma returns [[]] (a truthy list) when nothing matches,
        # so the old `if docs["documents"]` check never hit its fallback.
        # Inspect the inner result list instead.
        retrieved = docs.get("documents") or [[]]
        context = "\n".join(retrieved[0]) if retrieved[0] else "No relevant context found."

        prompt = f"""You are a research assistant. Answer the question based ONLY on the provided context.
Context from the document:
{context}
Question: {question}
Instructions:
1. Answer based ONLY on the information in the context above.
2. If the context doesn't contain relevant information, say "The document doesn't contain information about this."
3. Be clear and concise.
4. Provide page references if available.
5. Use bullet points for lists when appropriate.
"""
        response = groq_client.chat.completions.create(
            model="meta-llama/llama-4-scout-17b-16e-instruct",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.1,  # low temperature: stick close to the context
            max_tokens=500
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"❌ An error occurred: {str(e)}"
# Example prompts shown beneath both interfaces. The first slot is None
# because Gradio examples cannot pre-populate a file upload.
examples = [
    [None, "What is the main idea of this document?"],
    [None, "Summarize the content briefly."],
    [None, "What methodology is used in this paper?"],
    [None, "What are the key results presented by the author?"],
    [None, "Explain any important equations or algorithms mentioned."],
    [None, "What are the strengths and weaknesses of this work?"],
    [None, "Does the paper discuss related or previous work?"],
    [None, "What practical applications are proposed?"]
]
# --- English-language interface ---
pdf_input_en = gr.File(
    label="📄 Upload PDF",
    file_types=[".pdf"],
    type="filepath"  # hand the handler a filesystem path, not raw bytes
)
question_input_en = gr.Textbox(
    label="❓ Question",
    lines=2,
    placeholder="Type your question about the PDF content here..."
)
answer_output_en = gr.Textbox(
    label="✅ Answer",
    lines=10
)
interface_en = gr.Interface(
    fn=answer_from_pdf,
    inputs=[pdf_input_en, question_input_en],
    outputs=answer_output_en,
    title="📚 PDF Research Assistant",
    description="Upload a PDF file and ask any question related to its content. The system will extract text and provide answers based on the document.",
    examples=examples,
    theme=gr.themes.Soft()
)
# --- Arabic-language interface ---
pdf_input_ar = gr.File(
    label="📄 ارفع ملف PDF",
    file_types=[".pdf"],
    type="filepath"
)
question_input_ar = gr.Textbox(
    label="❓ السؤال",
    lines=2,
    placeholder="اكتب سؤالك حول محتوى ملف PDF هنا..."
)
answer_output_ar = gr.Textbox(
    label="✅ الإجابة",
    lines=10
)
interface_ar = gr.Interface(
    fn=answer_from_pdf,
    inputs=[pdf_input_ar, question_input_ar],
    outputs=answer_output_ar,
    title="📚 مساعد البحث في ملفات PDF",
    description="ارفع ملف PDF واسأل أي سؤال متعلق بمحتواه. سيقوم النظام باستخراج النص وتقديم إجابات بناءً على المستند.",
    examples=examples,
    theme=gr.themes.Soft()
)
# Combine both language versions into one tabbed app.
demo = gr.TabbedInterface(
    interface_list=[interface_en, interface_ar],
    tab_names=["English Version", "النسخة العربية"]
)

# Launch the application when run as a script.
if __name__ == "__main__":
    # share=True requests a public gradio.live link; binding 0.0.0.0
    # exposes the server on all interfaces (needed inside containers).
    demo.launch(
        share=True,
        debug=False,
        server_name="0.0.0.0",
        server_port=7860
    )