Spaces:

ahm14
/

Exam_Developer

Sleeping

App Files Community

ahm14 committed on Jan 5, 2025

Commit

dc17fdf

verified ·

1 Parent(s): 7337e1e

Update app.py

Browse files

Files changed (1) hide show

app.py +16 -13

app.py CHANGED Viewed

@@ -22,6 +22,16 @@ llm = ChatGroq(temperature=0.5, groq_api_key="gsk_cnE3PNB19Dg4H2UNQ1zbWGdyb3FYsl
 # OCR Configuration for Pytesseract
 pytesseract.pytesseract.tesseract_cmd = r"/usr/bin/tesseract"  # Adjust to your system's path
 # Function to extract text, images, tables, and formulas from PDF
 def extract_pdf_data(pdf_path):
     data = {"text": "", "tables": [], "images": []}
@@ -58,18 +68,8 @@ def extract_text_file_data(text_file):
         logging.error(f"Error extracting TXT content: {e}")
         return ""
-# Function to extract text from images using OCR
-def extract_text_from_images(images):
-    ocr_text = ""
-    for image in images:
-        try:
-            ocr_text += pytesseract.image_to_string(image).strip() + "\n"
-        except Exception as e:
-            logging.error(f"Error in OCR: {e}")
-    return ocr_text.strip()
 # Function to process extracted content (PDF, DOCX, etc.)
-def process_content(file_data, file_type):
     text = ""
     images = []
     if file_type == "pdf":
@@ -84,7 +84,7 @@ def process_content(file_data, file_type):
         image = Image.open(file_data)
         images.append(image)
-    ocr_text = extract_text_from_images(images)
     return text + "\n" + ocr_text
 # Function to process PDF content
@@ -174,6 +174,9 @@ instructor_name = st.sidebar.text_input("Enter Instructor Name", "Instructor Nam
 class_name = st.sidebar.text_input("Enter Class Name", "Class Name")
 institution_name = st.sidebar.text_input("Enter Institution Name", "Institution Name")
 if uploaded_file:
     # Clear session state when a new file is uploaded
     if "uploaded_filename" in st.session_state and st.session_state.uploaded_filename != uploaded_file.name:
@@ -187,7 +190,7 @@ if uploaded_file:
     if file_type not in ["pdf", "docx", "txt", "png", "jpg"]:
         st.error("Unsupported file type. Please upload PDF, DOCX, TXT, or image files.")
     else:
-        syllabus_text = process_content(uploaded_file, file_type)
         st.session_state.syllabus_text = syllabus_text
 # Preview of Syllabus

 # OCR Configuration for Pytesseract
 pytesseract.pytesseract.tesseract_cmd = r"/usr/bin/tesseract"  # Adjust to your system's path
+# Enhanced OCR with configurable language option
+def extract_text_from_images(images, lang="eng"):
+    ocr_text = ""
+    for image in images:
+        try:
+            ocr_text += pytesseract.image_to_string(image, lang=lang).strip() + "\n"
+        except Exception as e:
+            logging.error(f"Error in OCR: {e}")
+    return ocr_text.strip()
 # Function to extract text, images, tables, and formulas from PDF
 def extract_pdf_data(pdf_path):
     data = {"text": "", "tables": [], "images": []}
         logging.error(f"Error extracting TXT content: {e}")
         return ""
 # Function to process extracted content (PDF, DOCX, etc.)
+def process_content(file_data, file_type, lang="eng"):
     text = ""
     images = []
     if file_type == "pdf":
         image = Image.open(file_data)
         images.append(image)
+    ocr_text = extract_text_from_images(images, lang)
     return text + "\n" + ocr_text
 # Function to process PDF content
 class_name = st.sidebar.text_input("Enter Class Name", "Class Name")
 institution_name = st.sidebar.text_input("Enter Institution Name", "Institution Name")
+# Language Option for OCR
+ocr_lang = st.sidebar.selectbox("Select OCR Language", ["eng", "spa", "fra", "deu", "ita"])
 if uploaded_file:
     # Clear session state when a new file is uploaded
     if "uploaded_filename" in st.session_state and st.session_state.uploaded_filename != uploaded_file.name:
     if file_type not in ["pdf", "docx", "txt", "png", "jpg"]:
         st.error("Unsupported file type. Please upload PDF, DOCX, TXT, or image files.")
     else:
+        syllabus_text = process_content(uploaded_file, file_type, lang=ocr_lang)
         st.session_state.syllabus_text = syllabus_text
 # Preview of Syllabus