ahm14 committed on
Commit
dc17fdf
·
verified ·
1 Parent(s): 7337e1e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -13
app.py CHANGED
@@ -22,6 +22,16 @@ llm = ChatGroq(temperature=0.5, groq_api_key="gsk_cnE3PNB19Dg4H2UNQ1zbWGdyb3FYsl
22
  # OCR Configuration for Pytesseract
23
  pytesseract.pytesseract.tesseract_cmd = r"/usr/bin/tesseract" # Adjust to your system's path
24
 
 
 
 
 
 
 
 
 
 
 
25
  # Function to extract text, images, tables, and formulas from PDF
26
  def extract_pdf_data(pdf_path):
27
  data = {"text": "", "tables": [], "images": []}
@@ -58,18 +68,8 @@ def extract_text_file_data(text_file):
58
  logging.error(f"Error extracting TXT content: {e}")
59
  return ""
60
 
61
- # Function to extract text from images using OCR
62
- def extract_text_from_images(images):
63
- ocr_text = ""
64
- for image in images:
65
- try:
66
- ocr_text += pytesseract.image_to_string(image).strip() + "\n"
67
- except Exception as e:
68
- logging.error(f"Error in OCR: {e}")
69
- return ocr_text.strip()
70
-
71
  # Function to process extracted content (PDF, DOCX, etc.)
72
- def process_content(file_data, file_type):
73
  text = ""
74
  images = []
75
  if file_type == "pdf":
@@ -84,7 +84,7 @@ def process_content(file_data, file_type):
84
  image = Image.open(file_data)
85
  images.append(image)
86
 
87
- ocr_text = extract_text_from_images(images)
88
  return text + "\n" + ocr_text
89
 
90
  # Function to process PDF content
@@ -174,6 +174,9 @@ instructor_name = st.sidebar.text_input("Enter Instructor Name", "Instructor Nam
174
  class_name = st.sidebar.text_input("Enter Class Name", "Class Name")
175
  institution_name = st.sidebar.text_input("Enter Institution Name", "Institution Name")
176
 
 
 
 
177
  if uploaded_file:
178
  # Clear session state when a new file is uploaded
179
  if "uploaded_filename" in st.session_state and st.session_state.uploaded_filename != uploaded_file.name:
@@ -187,7 +190,7 @@ if uploaded_file:
187
  if file_type not in ["pdf", "docx", "txt", "png", "jpg"]:
188
  st.error("Unsupported file type. Please upload PDF, DOCX, TXT, or image files.")
189
  else:
190
- syllabus_text = process_content(uploaded_file, file_type)
191
  st.session_state.syllabus_text = syllabus_text
192
 
193
  # Preview of Syllabus
 
22
  # OCR Configuration for Pytesseract
23
  pytesseract.pytesseract.tesseract_cmd = r"/usr/bin/tesseract" # Adjust to your system's path
24
 
25
+ # Enhanced OCR with configurable language option
26
+ def extract_text_from_images(images, lang="eng"):
27
+ ocr_text = ""
28
+ for image in images:
29
+ try:
30
+ ocr_text += pytesseract.image_to_string(image, lang=lang).strip() + "\n"
31
+ except Exception as e:
32
+ logging.error(f"Error in OCR: {e}")
33
+ return ocr_text.strip()
34
+
35
  # Function to extract text, images, tables, and formulas from PDF
36
  def extract_pdf_data(pdf_path):
37
  data = {"text": "", "tables": [], "images": []}
 
68
  logging.error(f"Error extracting TXT content: {e}")
69
  return ""
70
 
 
 
 
 
 
 
 
 
 
 
71
  # Function to process extracted content (PDF, DOCX, etc.)
72
+ def process_content(file_data, file_type, lang="eng"):
73
  text = ""
74
  images = []
75
  if file_type == "pdf":
 
84
  image = Image.open(file_data)
85
  images.append(image)
86
 
87
+ ocr_text = extract_text_from_images(images, lang)
88
  return text + "\n" + ocr_text
89
 
90
  # Function to process PDF content
 
174
  class_name = st.sidebar.text_input("Enter Class Name", "Class Name")
175
  institution_name = st.sidebar.text_input("Enter Institution Name", "Institution Name")
176
 
177
+ # Language Option for OCR
178
+ ocr_lang = st.sidebar.selectbox("Select OCR Language", ["eng", "spa", "fra", "deu", "ita"])
179
+
180
  if uploaded_file:
181
  # Clear session state when a new file is uploaded
182
  if "uploaded_filename" in st.session_state and st.session_state.uploaded_filename != uploaded_file.name:
 
190
  if file_type not in ["pdf", "docx", "txt", "png", "jpg"]:
191
  st.error("Unsupported file type. Please upload PDF, DOCX, TXT, or image files.")
192
  else:
193
+ syllabus_text = process_content(uploaded_file, file_type, lang=ocr_lang)
194
  st.session_state.syllabus_text = syllabus_text
195
 
196
  # Preview of Syllabus