Spaces:

mkfallah
/

asl01

Sleeping

App Files Files Community

mkfallah committed on Jun 2, 2025

Commit

01a49dc

verified ·

1 Parent(s): c8c3a81

Upload 2 files

Browse files

Files changed (2) hide show

app.py +81 -0
requirements.txt +6 -0

app.py ADDED Viewed

	@@ -0,0 +1,81 @@

+import gradio as gr
+import pdfplumber
+import fitz  # PyMuPDF
+from sentence_transformers import SentenceTransformer, util
+import faiss
+import numpy as np
+import re
def extract_text_from_pdf(file):
    """Extract the text of every page of a PDF.

    Args:
        file: Either a filesystem path given as a ``str``, or an object
            exposing the path through a ``.name`` attribute (such as an
            uploaded-file wrapper). ``None`` is treated as "no file".

    Returns:
        The text of all pages that yielded any text, each followed by a
        newline, concatenated in page order. An empty string when ``file``
        is ``None`` or no page produced text.
    """
    if file is None:
        return ""

    # A plain string is already the path; anything else is expected to carry
    # the path in ``.name`` (the original calling convention).
    path = file if isinstance(file, str) else file.name

    page_texts = []
    with pdfplumber.open(path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            # Skip pages for which no text came back.
            if page_text:
                page_texts.append(page_text + "\n")
    return "".join(page_texts)
def clean_text(text):
    """Normalize whitespace in extracted text.

    Runs of newlines collapse to a single newline, runs of spaces and tabs
    collapse to a single space, and leading/trailing whitespace is removed.
    """
    substitutions = (
        (r'\n+', '\n'),
        (r'[ \t]+', ' '),
    )
    normalized = text
    for pattern, replacement in substitutions:
        normalized = re.sub(pattern, replacement, normalized)
    return normalized.strip()
def _overlap_tail(chunk, overlap):
    """Return up to ``overlap`` trailing characters of ``chunk`` on a word boundary."""
    if overlap <= 0:
        return ""
    if len(chunk) <= overlap:
        return chunk
    tail = chunk[-overlap:]
    # If the cut landed inside a word, drop the leading partial word so the
    # carried-over context starts cleanly.
    cut_mid_word = not (chunk[-overlap - 1].isspace() or tail[0].isspace())
    if cut_mid_word:
        pieces = tail.split(None, 1)
        tail = pieces[1] if len(pieces) == 2 else ""
    return tail.strip()


def chunk_text(text, chunk_size=500, overlap=50):
    """Split text into sentence-aligned chunks.

    Sentences are detected by splitting after ``.``, ``!`` or ``?`` followed
    by one or more spaces, then packed greedily into chunks.

    Args:
        text: The text to split.
        chunk_size: Target maximum number of characters per chunk, counting
            the single spaces that join sentences. A single sentence longer
            than this is kept whole, and carried-over overlap text may push a
            chunk past the limit.
        overlap: Approximate number of trailing characters of a finished
            chunk that are repeated at the start of the next one, trimmed to
            a word boundary. ``0`` disables overlap.

    Returns:
        A list of non-empty chunk strings; an empty list when ``text``
        contains no sentences.
    """
    sentences = re.split(r'(?<=[.!?]) +', text)
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue
        if not current_chunk:
            # First sentence of a chunk is always accepted, even if it is
            # longer than chunk_size; this avoids emitting an empty chunk.
            current_chunk = sentence
            continue
        # +1 accounts for the space that joins the two pieces.
        if len(current_chunk) + 1 + len(sentence) <= chunk_size:
            current_chunk = f"{current_chunk} {sentence}"
            continue
        chunks.append(current_chunk)
        carried = _overlap_tail(current_chunk, overlap)
        current_chunk = f"{carried} {sentence}" if carried else sentence
    if current_chunk:
        chunks.append(current_chunk)
    return chunks
# Load the sentence-embedding model once at import time; it is used for both
# the document chunks and the question.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(EMBEDDING_MODEL_NAME)
def answer_question(pdf_file, question):
    """Return the PDF passages most similar to a question.

    The PDF text is extracted, cleaned and chunked; the chunks and the
    question are embedded with the module-level ``model``; and a flat L2
    FAISS index is searched for the nearest chunks.

    Args:
        pdf_file: The uploaded PDF, as accepted by ``extract_text_from_pdf``.
            ``None`` means nothing was uploaded.
        question: The question text.

    Returns:
        Up to three matching chunks, nearest first, joined by a ``---``
        separator. A short user-facing message (in Persian, matching the UI)
        when the file or question is missing, or when the PDF yields no text.
    """
    if pdf_file is None or not question or not question.strip():
        return "لطفاً یک فایل PDF آپلود کنید و پرسش خود را وارد کنید."

    # Extract and clean the text, then split it into chunks.
    raw_text = extract_text_from_pdf(pdf_file)
    cleaned_text = clean_text(raw_text)
    chunks = [chunk for chunk in chunk_text(cleaned_text) if chunk]
    if not chunks:
        # Nothing to embed (e.g. a scanned PDF without a text layer);
        # building an index from an empty embedding array would fail.
        return "متنی در فایل PDF یافت نشد."

    # Embed the chunks and build the FAISS index (FAISS works on float32).
    embeddings = np.asarray(model.encode(chunks), dtype="float32")
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    # Embed the question and look up its nearest chunks.
    question_embedding = np.asarray(model.encode([question]), dtype="float32")
    # Never ask for more neighbours than there are chunks: FAISS pads missing
    # results with -1, which would otherwise index the last chunk.
    top_k = min(3, len(chunks))
    _, neighbor_ids = index.search(question_embedding, k=top_k)

    answers = [chunks[i] for i in neighbor_ids[0] if i >= 0]
    return "\n\n---\n\n".join(answers)
# Gradio user interface: a PDF upload and a question box in, plain text out.
pdf_input = gr.File(label="آپلود فایل PDF", file_types=[".pdf"])
question_input = gr.Textbox(label="پرسش خود را وارد کنید")

iface = gr.Interface(
    fn=answer_question,
    inputs=[pdf_input, question_input],
    outputs="text",
    title="پاسخ به پرسش‌ها از روی فایل PDF",
    description="یک سیستم RAG ساده برای پاسخ به پرسش‌ها از روی محتوای فایل PDF",
)
iface.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+gradio>=5.0.0
+transformers>=4.39.3
+sentence-transformers>=2.7.0
+faiss-cpu>=1.7.4
+pdfplumber>=0.10.0
+PyMuPDF>=1.22.0