Spaces:

F-allahmoradi
/

OCR

Sleeping

App Files Community

F-allahmoradi committed on Oct 21, 2025

Commit

8bec47b

verified ·

1 Parent(s): 52268f5

Update app.py

Browse files

Files changed (1) hide show

app.py +23 -12

app.py CHANGED Viewed

@@ -36,7 +36,6 @@ def normalize_text(text: str) -> str:
     for wrong, correct in corrections.items():
         text = text.replace(wrong, correct)
-    # حذف کلمات غیرمعنی‌دار یا عدد/حرف لاتین وسط متن
     text = re.sub(r'\b[0-9a-zA-Z\-]+\b', '', text)
     text = re.sub(r'[^\w\s\u200c\u200d\u200e\u200f\u0600-\u06FF]', ' ', text)
     text = re.sub(r'\s+', ' ', text).strip()
@@ -50,19 +49,18 @@ def normalize_text(text: str) -> str:
 # حذف نویز و اعداد وسط متن
 # ----------------------------------------------------------------------
 def remove_noise(text: str) -> str:
-    # حذف اعداد فارسی و انگلیسی وسط متن
     text = re.sub(r'\b[0-9۰-۹]+\b', '', text)
     text = re.sub(r'\s+', ' ', text).strip()
     return text
 # ----------------------------------------------------------------------
-# فرمت شبیه کتاب (حذف شکست خط وسط جمله)
 # ----------------------------------------------------------------------
 def format_as_book(text: str) -> str:
-    text = re.sub(r'\n+', '\n', text)  # چند خط پشت سر هم → ۱ خط
-    text = re.sub(r'(?<![.؟!])\n\s*', ' ', text)  # خط وسط جمله → فاصله
-    text = re.sub(r'\s+', ' ', text).strip()  # حذف فاصله اضافی
-    text = re.sub(r'([.؟!])\s+', r'\1\n', text)  # بعد از پایان جمله، خط جدید
     return text
 # ----------------------------------------------------------------------
@@ -78,7 +76,7 @@ def ocr_from_image(image: Image.Image) -> str:
     return text
 # ----------------------------------------------------------------------
-# OCR از PDF
 # ----------------------------------------------------------------------
 def ocr_from_pdf(pdf_path: str, start_page: int, end_page: int) -> str:
     try:
@@ -91,13 +89,26 @@ def ocr_from_pdf(pdf_path: str, start_page: int, end_page: int) -> str:
             return "❌ شماره صفحه شروع از تعداد کل صفحات بیشتر است."
         images = convert_from_path(pdf_path, dpi=300, first_page=start_page, last_page=end_page)
         all_text = f"📊 استخراج صفحات {start_page} تا {end_page} از {total_pages} صفحه:\n\n"
         for i, img in enumerate(images):
             page_num = start_page + i
             text = ocr_from_image(img)
             all_text += f"--- صفحه {page_num} ---\n{text}\n\n"
         return all_text.strip()
     except Exception as e:
@@ -162,7 +173,7 @@ def main():
             if st.button("🚀 استخراج متن از PDF", use_container_width=True):
                 with st.spinner("در حال پردازش..."):
                     result = ocr_from_pdf(tmp_path, start_page, end_page)
-                    st.markdown("### 📝 متن استخراج‌شده (فرمت کتابی)")
                     st.text_area("📘 خروجی OCR", result, height=600)
                     st.download_button("📥 دانلود متن", result, file_name="extracted_text.txt")
@@ -172,7 +183,7 @@ def main():
                 with st.spinner("در حال پردازش..."):
                     image = Image.open(tmp_path)
                     result = ocr_from_image(image)
-                    st.markdown("### 📝 متن استخراج‌شده (فرمت کتابی)")
                     st.text_area("📘 خروجی OCR", result, height=600)
                     st.download_button("📥 دانلود متن", result, file_name="extracted_text.txt")
@@ -183,4 +194,4 @@ def main():
         st.info("📁 لطفاً یک فایل آپلود کنید.")
 if __name__ == "__main__":
-    main()

     for wrong, correct in corrections.items():
         text = text.replace(wrong, correct)
     text = re.sub(r'\b[0-9a-zA-Z\-]+\b', '', text)
     text = re.sub(r'[^\w\s\u200c\u200d\u200e\u200f\u0600-\u06FF]', ' ', text)
     text = re.sub(r'\s+', ' ', text).strip()
 # حذف نویز و اعداد وسط متن
 # ----------------------------------------------------------------------
 def remove_noise(text: str) -> str:
     text = re.sub(r'\b[0-9۰-۹]+\b', '', text)
     text = re.sub(r'\s+', ' ', text).strip()
     return text
 # ----------------------------------------------------------------------
+# فرمت شبیه کتاب
 # ----------------------------------------------------------------------
 def format_as_book(text: str) -> str:
+    text = re.sub(r'\n+', '\n', text)
+    text = re.sub(r'(?<![.؟!])\n\s*', ' ', text)
+    text = re.sub(r'\s+', ' ', text).strip()
+    text = re.sub(r'([.؟!])\s+', r'\1\n', text)
     return text
 # ----------------------------------------------------------------------
     return text
 # ----------------------------------------------------------------------
+# OCR از PDF (با نمایش پیشرفت)
 # ----------------------------------------------------------------------
 def ocr_from_pdf(pdf_path: str, start_page: int, end_page: int) -> str:
     try:
             return "❌ شماره صفحه شروع از تعداد کل صفحات بیشتر است."
         images = convert_from_path(pdf_path, dpi=300, first_page=start_page, last_page=end_page)
         all_text = f"📊 استخراج صفحات {start_page} تا {end_page} از {total_pages} صفحه:\n\n"
+        # 🟢 نوار پیشرفت و متن وضعیت
+        progress_bar = st.progress(0)
+        status_text = st.empty()
+        total_to_process = len(images)
         for i, img in enumerate(images):
             page_num = start_page + i
+            status_text.text(f"در حال پردازش صفحه {page_num} از {end_page} ...")
+            # OCR صفحه
             text = ocr_from_image(img)
             all_text += f"--- صفحه {page_num} ---\n{text}\n\n"
+            # بروزرسانی درصد
+            percent = int(((i + 1) / total_to_process) * 100)
+            progress_bar.progress(percent)
+        status_text.text("✅ پردازش کامل شد.")
         return all_text.strip()
     except Exception as e:
             if st.button("🚀 استخراج متن از PDF", use_container_width=True):
                 with st.spinner("در حال پردازش..."):
                     result = ocr_from_pdf(tmp_path, start_page, end_page)
+                    st.markdown("### 📝 متن استخراج‌شده ")
                     st.text_area("📘 خروجی OCR", result, height=600)
                     st.download_button("📥 دانلود متن", result, file_name="extracted_text.txt")
                 with st.spinner("در حال پردازش..."):
                     image = Image.open(tmp_path)
                     result = ocr_from_image(image)
+                    st.markdown("### 📝 متن استخراج‌شده")
                     st.text_area("📘 خروجی OCR", result, height=600)
                     st.download_button("📥 دانلود متن", result, file_name="extracted_text.txt")
         st.info("📁 لطفاً یک فایل آپلود کنید.")
 if __name__ == "__main__":
+    main()