Spaces:

ramysaidagieb
/

Answer1

Sleeping

App Files Files Community

Answer1 / app.py

ramysaidagieb

Update app.py

9872af4 verified 10 months ago

raw

history blame contribute delete

4.46 kB

	import gradio as gr
	import faiss
	import numpy as np
	from transformers import AutoTokenizer, AutoModel
	from sentence_transformers import SentenceTransformer
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from pdfminer.high_level import extract_text
	import docx

	# Shared module state used by the Gradio callbacks in this file.
	# `index` is rebound to a FAISS index once training succeeds; until then it
	# stays None. `texts` holds the text entries that index positions refer to.
	index = None
	texts = []

	# Embedding model used for both the book chunks and the user's question.
	# NOTE(review): this checkpoint looks like a plain BERT model rather than a
	# dedicated sentence-embedding model - confirm the pooling that
	# SentenceTransformer applies to it is what is wanted.
	embedding_model = SentenceTransformer("CAMeL-Lab/bert-base-arabic-camelbert-mix")

	def extract_text_from_pdf(file_path):
	    """Return the text extracted from the PDF at *file_path*.

	    Extraction is best-effort: if ``extract_text`` raises, the error is
	    printed and an empty string is returned instead of propagating.
	    """
	    try:
	        content = extract_text(file_path)
	    except Exception as e:
	        # Report the failure but let the caller carry on with other files.
	        print(f"Error extracting from PDF: {e}")
	        content = ""
	    return content

	def extract_text_from_docx(file_path):
	    """Return the paragraph text of the Word document at *file_path*.

	    Paragraph texts are joined with newlines. Extraction is best-effort: if
	    opening or reading the document raises, the error is printed and an
	    empty string is returned.
	    """
	    # NOTE(review): process_files also sends ".doc" files here - confirm the
	    # docx library can actually open the legacy format.
	    try:
	        paragraphs = docx.Document(file_path).paragraphs
	        content = "\n".join(paragraph.text for paragraph in paragraphs)
	    except Exception as e:
	        print(f"Error extracting from DOCX: {e}")
	        content = ""
	    return content

	def _uploaded_path(item):
	    """Return the filesystem path behind one uploaded item, or None."""
	    if isinstance(item, str):
	        return item
	    # Some Gradio versions pass tempfile-like wrappers whose ``name``
	    # attribute is the path of the uploaded copy.
	    name = getattr(item, "name", None)
	    return name if isinstance(name, str) else None

	def process_files(files, progress=gr.Progress()):
	    """Build the search index from the uploaded books.

	    Args:
	        files: Uploaded items from the file component. Each item may be a
	            path string or an object exposing the path as ``.name``.
	            Anything else, and any extension other than .pdf/.docx/.doc
	            (matched case-insensitively), is skipped.
	        progress: Gradio progress tracker, called with a fraction and an
	            Arabic description at each step.

	    Returns:
	        An Arabic status message describing success or the reason for
	        failure. Exceptions are caught and reported in the message.

	    The module-level ``index`` and ``texts`` are replaced together, and only
	    after every step has succeeded, so a failed run never leaves the stored
	    chunks out of sync with the index that ``answer_question`` searches.
	    """
	    global index, texts

	    if not files:
	        return "⚠️ لم يتم رفع أي ملفات. الرجاء رفع كتاب واحد على الأقل."

	    try:
	        # Step 1: Extract text
	        progress(0.1, desc="جاري استخراج النصوص من الكتب...")
	        documents = []
	        for item in files:
	            path = _uploaded_path(item)
	            if path is None:
	                continue

	            lowered = path.lower()  # accept "BOOK.PDF" as well as "book.pdf"
	            if lowered.endswith(".pdf"):
	                text = extract_text_from_pdf(path)
	            elif lowered.endswith((".docx", ".doc")):
	                text = extract_text_from_docx(path)
	            else:
	                continue

	            if text:
	                documents.append(text)

	        if not documents:
	            return "⚠️ لم يتم استخراج نصوص صالحة من الملفات."

	        # Step 2: Chunk the text
	        progress(0.4, desc="تقطيع النصوص إلى فقرات...")
	        splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
	        chunks = []
	        for document in documents:
	            chunks.extend(splitter.split_text(document))

	        if not chunks:
	            return "⚠️ لا يوجد محتوى نصي كافٍ للتدريب."

	        # Step 3: Embed the text
	        progress(0.7, desc="تحويل الفقرات إلى متجهات...")
	        embeddings = embedding_model.encode(chunks, show_progress_bar=True)

	        # Step 4: Build FAISS index
	        progress(0.9, desc="بناء قاعدة بيانات البحث...")
	        vectors = np.array(embeddings).astype(np.float32)
	        new_index = faiss.IndexFlatL2(vectors.shape[1])
	        new_index.add(vectors)

	        # Publish both pieces of state in one step, after nothing can fail.
	        index, texts = new_index, chunks

	        return "✅ النظام جاهز للإجابة على أسئلتك"
	    except Exception as e:
	        # UI boundary: surface the failure as a status message.
	        return f"❌ حدث خطأ أثناء التدريب: {str(e)}"

	def answer_question(question):
	    """Return the stored chunk closest to *question*.

	    Args:
	        question: The user's question text.

	    Returns:
	        The text of the single nearest chunk in the index, or an Arabic
	        status message when the system has not been trained, the question
	        is blank, no neighbour is found, or an error occurs.
	    """
	    if index is None or not texts:
	        return "⚠️ الرجاء رفع كتبك وتدريب النظام أولاً."

	    # A blank question would otherwise be embedded and matched to an
	    # arbitrary chunk, which looks like a real answer to the user.
	    if not isinstance(question, str) or not question.strip():
	        return "⚠️ الرجاء كتابة سؤال أولاً."

	    try:
	        query = np.array(embedding_model.encode([question])).astype(np.float32)

	        # Only the neighbour ids are needed; the distances are discarded.
	        _, neighbours = index.search(query, k=1)
	        best = int(neighbours[0][0])
	        if best == -1:
	            return "❌ لم يتم العثور على إجابة."

	        return texts[best]
	    except Exception as e:
	        # UI boundary: surface the failure as the answer text.
	        return f"❌ حدث خطأ أثناء الإجابة: {str(e)}"

	# Gradio user interface: upload books, train, then ask questions.
	# NOTE(review): the nesting below was reconstructed from a flattened copy of
	# the file - confirm which components belong inside each Row.
	with gr.Blocks() as demo:
	    gr.Markdown(
	        "# 📚 نظام محاكاة دماغ المؤلف العربي\n"
	        "ارفع كتبك ودرب النظام للإجابة على أسئلتك باللغة العربية فقط."
	    )

	    # Upload area for the source books.
	    with gr.Row():
	        file_input = gr.File(
	            label="📄 ارفع ملفات الكتب (PDF أو DOCX)",
	            file_types=[".pdf", ".docx", ".doc"],
	            file_count="multiple",
	        )

	    # Training trigger and its status readout.
	    with gr.Row():
	        train_button = gr.Button("🚀 ابدأ التدريب على الكتب")

	    output_text = gr.Textbox(label="🔵 حالة التدريب")

	    # Question box and answer box.
	    with gr.Row():
	        question_input = gr.Textbox(label="✍️ اكتب سؤالك هنا")
	        answer_output = gr.Textbox(label="🧠 إجابة النظام")

	    # Wire the callbacks: the button trains, submitting the question answers.
	    train_button.click(
	        fn=process_files,
	        inputs=[file_input],
	        outputs=[output_text],
	    )
	    question_input.submit(
	        fn=answer_question,
	        inputs=[question_input],
	        outputs=[answer_output],
	    )

	demo.launch()