|
|
import gradio as gr
|
|
|
import pdfplumber
|
|
|
import fitz
|
|
|
from sentence_transformers import SentenceTransformer, util
|
|
|
import faiss
|
|
|
import numpy as np
|
|
|
import re
|
|
|
|
|
|
|
|
|
def extract_text_from_pdf(file):
    """Return the concatenated text of every page of an uploaded PDF.

    Args:
        file: The upload to read. Either a filesystem path given as ``str``
            or an object exposing the path as ``.name`` (NOTE(review): which
            of the two Gradio passes depends on its version/configuration --
            confirm against the installed release). ``None`` means nothing
            was uploaded.

    Returns:
        The text of all pages that yielded any text, each followed by a
        newline. An empty string when ``file`` is ``None`` or no page
        produced text.
    """
    if file is None:
        return ""

    # Accept a bare path as well as a file-like wrapper carrying ``.name``.
    path = file if isinstance(file, str) else file.name

    page_texts = []
    with pdfplumber.open(path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            # Pages without extractable text give a falsy result; skip them.
            if page_text:
                page_texts.append(page_text + "\n")
    return "".join(page_texts)
|
|
|
|
|
|
|
|
|
def clean_text(text):
    """Normalise whitespace in extracted text.

    Runs of spaces/tabs become a single space, and any run of line breaks
    (including ``\\r\\n`` endings and "blank" lines that hold only spaces or
    tabs) becomes a single ``\\n`` with no spaces around it. Leading and
    trailing whitespace is removed.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text.
    """
    # Unify line endings first so the patterns below only deal with "\n".
    text = text.replace('\r\n', '\n').replace('\r', '\n')
    # Collapse horizontal whitespace before touching newlines; otherwise a
    # line containing only spaces would keep two newlines apart.
    text = re.sub(r'[ \t]+', ' ', text)
    # A newline run may now be interleaved with single spaces; fold the
    # whole run, and the spaces touching it, into one newline.
    text = re.sub(r' ?\n[ \n]*', '\n', text)
    return text.strip()
|
|
|
|
|
|
|
|
|
def _overlap_tail(sentences, budget):
    """Return the longest run of trailing *sentences* whose joined length fits *budget*."""
    tail = []
    used = 0
    for sentence in reversed(sentences):
        # One extra character for the space that joins it to the tail.
        cost = len(sentence) + (1 if tail else 0)
        if used + cost > budget:
            break
        tail.append(sentence)
        used += cost
    tail.reverse()
    return tail


def chunk_text(text, chunk_size=500, overlap=50):
    """Split *text* into sentence-aligned chunks of at most *chunk_size* characters.

    Sentences are detected by ``.``, ``!`` or ``?`` followed by spaces and
    are never cut in half: a single sentence longer than ``chunk_size``
    becomes its own (oversized) chunk.

    Args:
        text: The text to split.
        chunk_size: Maximum length, in characters, of a chunk (sentences
            joined by single spaces).
        overlap: Maximum number of characters of context repeated from the
            end of one chunk at the start of the next. Only whole trailing
            sentences are repeated, and only as long as the new chunk still
            fits ``chunk_size``. ``0`` disables overlap.

    Returns:
        A list of non-empty chunk strings; an empty list for blank input.
    """
    pieces = (piece.strip() for piece in re.split(r'(?<=[.!?]) +', text))
    sentences = [piece for piece in pieces if piece]

    chunks = []
    current = []  # sentences of the chunk being assembled
    for sentence in sentences:
        # Flush only a non-empty chunk, so an oversized first sentence can
        # never produce an empty chunk.
        if current and len(" ".join(current)) + 1 + len(sentence) > chunk_size:
            chunks.append(" ".join(current))
            # Carry context forward, but never so much that the carried
            # text plus the incoming sentence would exceed chunk_size.
            budget = min(overlap, chunk_size - len(sentence) - 1)
            current = _overlap_tail(current, budget)
        current.append(sentence)

    if current:
        chunks.append(" ".join(current))
    return chunks
|
|
|
|
|
|
|
|
|
# Sentence-embedding model, instantiated once at module load and shared by
# every call that needs to encode text.
_EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(_EMBEDDING_MODEL_NAME)
|
|
|
|
|
|
|
|
|
def answer_question(pdf_file, question):
|
|
|
|
|
|
raw_text = extract_text_from_pdf(pdf_file)
|
|
|
cleaned_text = clean_text(raw_text)
|
|
|
|
|
|
|
|
|
chunks = chunk_text(cleaned_text)
|
|
|
|
|
|
|
|
|
embeddings = model.encode(chunks)
|
|
|
|
|
|
|
|
|
index = faiss.IndexFlatL2(embeddings.shape[1])
|
|
|
index.add(np.array(embeddings))
|
|
|
|
|
|
|
|
|
question_embedding = model.encode([question])
|
|
|
|
|
|
|
|
|
D, I = index.search(np.array(question_embedding), k=3)
|
|
|
|
|
|
|
|
|
answers = [chunks[i] for i in I[0]]
|
|
|
return "\n\n---\n\n".join(answers)
|
|
|
|
|
|
|
|
|
# Gradio UI: a PDF upload and a question box wired to answer_question,
# with the result shown as plain text.
iface = gr.Interface(
    fn=answer_question,
    inputs=[
        gr.File(label="آپلود فایل PDF", file_types=[".pdf"]),
        gr.Textbox(label="پرسش خود را وارد کنید"),
    ],
    outputs="text",
    title="پاسخ به پرسشها از روی فایل PDF",
    description="یک سیستم RAG ساده برای پاسخ به پرسشها از روی محتوای فایل PDF",
)

# Start the web server only when executed as a script, so that importing
# this module (tests, reuse of the helpers) does not launch it.
if __name__ == "__main__":
    iface.launch()
|
|
|
|