suprimedev committed on
Commit
6b784e9
·
verified ·
1 Parent(s): b5f1a8b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +47 -43
app.py CHANGED
@@ -1,50 +1,54 @@
1
- import os
2
  import fitz
3
- import easyocr
4
- from PIL import Image
5
  import arabic_reshaper
 
 
6
 
7
- reader = easyocr.Reader(['fa','ar','en'], gpu=False)
 
8
 
def extract_text_from_pdf(pdf_file):
    """Extract the text of every page of a PDF, with an OCR fallback.

    Pages whose embedded text layer is empty or has fewer than 10 distinct
    characters are rendered at 150 dpi and read with the module-level
    EasyOCR ``reader``. Text containing Arabic-script characters is passed
    through ``arabic_reshaper.reshape``.

    Args:
        pdf_file: Path of the PDF as a string, or an object exposing that
            path through a ``name`` attribute. ``None`` means no upload.

    Returns:
        A ``(text, path)`` tuple. On success ``text`` is the extracted text
        with a page header before each page and ``path`` is
        ``"extracted_text.txt"``, written as UTF-8 in the current working
        directory. On any failure ``text`` is a user-facing message and
        ``path`` is ``None``; exceptions are not propagated.
    """
    if pdf_file is None:
        return "لطفاً یک فایل PDF آپلود کنید.", None

    # Resolve the argument to a filesystem path.
    if hasattr(pdf_file, "name"):  # file-like object carrying its path
        pdf_path = pdf_file.name
    elif isinstance(pdf_file, str):  # already a path string
        pdf_path = pdf_file
    else:
        return "نوع فایل نامعتبر است.", None

    try:
        # Imported here so the block stays self-contained; numpy is already
        # a dependency of EasyOCR.
        import numpy as np

        all_text = []
        # The context manager closes the document even if a page fails.
        with fitz.open(pdf_path) as pdf_document:
            for page_number, page in enumerate(pdf_document, start=1):
                text = page.get_text("text")

                # Fall back to OCR when the text layer is missing or
                # implausibly poor.
                if not text.strip() or len(set(text)) < 10:
                    pix = page.get_pixmap(dpi=150)
                    img = Image.frombytes(
                        "RGB", (pix.width, pix.height), pix.samples
                    )
                    # readtext() accepts a path/URL, bytes or a numpy array,
                    # not an arbitrary PIL image, so convert first.
                    text_lines = reader.readtext(np.asarray(img), detail=0)
                    text = "\n".join(text_lines)

                if any(
                    '\u0600' <= char <= '\u06FF' or '\u0750' <= char <= '\u077F'
                    for char in text
                ):
                    text = arabic_reshaper.reshape(text)

                all_text.append(f"--- صفحه {page_number} ---\n{text}\n")

        extracted_text = "\n".join(all_text)

        output_file = "extracted_text.txt"
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(extracted_text)

        return extracted_text, output_file

    except Exception as e:
        # UI boundary: report the failure as text instead of raising.
        return f"خطا در پردازش فایل: {str(e)}", None
 
 
 
 
1
+ import gradio as gr
2
  import fitz
 
 
3
  import arabic_reshaper
4
+ from PIL import Image
5
+ import easyocr
6
 
7
# Shared EasyOCR reader for Persian, Arabic and English, forced onto the CPU.
# It is created once at import time and reused for every request.
# NOTE: easyocr.Reader's `detector` argument is a boolean "load the detector"
# switch, not a model name; the detection network is chosen with
# `detect_network`. A string such as 'craft_mini' is merely truthy and does
# not select a lighter model, so the default detector is used explicitly here.
reader = easyocr.Reader(['fa', 'ar', 'en'], gpu=False)
9
 
def extract_text_from_pdf(pdf_file):
    """Extract the text of every page of a PDF, with an OCR fallback.

    Pages whose embedded text layer is empty or has fewer than 10 distinct
    characters are rendered at 150 dpi and read with the module-level
    EasyOCR ``reader``. Text containing Arabic-script characters is passed
    through ``arabic_reshaper.reshape``.

    Args:
        pdf_file: Path of the PDF as a string, or an object exposing that
            path through a ``name`` attribute. ``None`` means no upload.

    Returns:
        A ``(text, path)`` tuple. ``text`` is the extracted text with a page
        header before each page; ``path`` is ``"extracted_text.txt"``,
        written as UTF-8 in the current working directory. When ``pdf_file``
        is ``None`` the result is a prompt message and ``None``.

    Raises:
        Any exception raised while opening, rendering, recognising or
        writing is propagated to the caller.
    """
    if pdf_file is None:
        return "لطفاً یک فایل PDF آپلود کنید.", None

    # Imported here so the block stays self-contained; numpy is already a
    # dependency of EasyOCR.
    import numpy as np

    pdf_path = pdf_file.name if hasattr(pdf_file, "name") else pdf_file

    all_text = []
    # The context manager closes the document even if a page fails.
    with fitz.open(pdf_path) as pdf_document:
        for page_number, page in enumerate(pdf_document, start=1):
            text = page.get_text("text")

            # Run OCR only when the text layer is missing or implausibly poor.
            if not text.strip() or len(set(text)) < 10:
                pix = page.get_pixmap(dpi=150)
                img = Image.frombytes(
                    "RGB", (pix.width, pix.height), pix.samples
                )
                # readtext() accepts a path/URL, bytes or a numpy array, not
                # an arbitrary PIL image, so convert first.
                text = "\n".join(reader.readtext(np.asarray(img), detail=0))

            if any('\u0600' <= c <= '\u06FF' or '\u0750' <= c <= '\u077F' for c in text):
                text = arabic_reshaper.reshape(text)

            all_text.append(f"--- صفحه {page_number} ---\n{text}\n")

    extracted_text = "\n".join(all_text)

    # NOTE(review): the fixed file name is shared by all requests, so
    # concurrent users can overwrite each other's download.
    output_file = "extracted_text.txt"
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(extracted_text)

    return extracted_text, output_file
41
+
42
def create_interface():
    """Build and return the Gradio Blocks UI for the PDF text extractor.

    The layout is a heading, a PDF upload field, a trigger button, a
    textbox for the extracted text and a file component for the TXT
    download. Clicking the button calls ``extract_text_from_pdf`` with the
    uploaded file path and sends its two results to the textbox and the
    download component.
    """
    demo = gr.Blocks()
    with demo:
        gr.Markdown("## استخراج متن PDF سریع و کم‌مصرف")

        # Input: one PDF, handed to the callback as a filesystem path.
        upload = gr.File(
            label="آپلود PDF",
            file_types=[".pdf"],
            type="filepath",
        )
        run_button = gr.Button("استخراج متن")

        # Outputs: the text itself and the generated TXT file.
        result_box = gr.Textbox(label="متن استخراج شده", lines=20)
        result_file = gr.File(label="دانلود TXT")

        run_button.click(
            extract_text_from_pdf,
            upload,
            [result_box, result_file],
        )
    return demo
51
+
52
if __name__ == "__main__":
    # Script entry point: build the UI and start the Gradio server.
    create_interface().launch()