Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -22,6 +22,16 @@ llm = ChatGroq(temperature=0.5, groq_api_key="gsk_cnE3PNB19Dg4H2UNQ1zbWGdyb3FYsl
|
|
| 22 |
# OCR Configuration for Pytesseract
|
| 23 |
pytesseract.pytesseract.tesseract_cmd = r"/usr/bin/tesseract" # Adjust to your system's path
|
| 24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
# Function to extract text, images, tables, and formulas from PDF
|
| 26 |
def extract_pdf_data(pdf_path):
|
| 27 |
data = {"text": "", "tables": [], "images": []}
|
|
@@ -58,18 +68,8 @@ def extract_text_file_data(text_file):
|
|
| 58 |
logging.error(f"Error extracting TXT content: {e}")
|
| 59 |
return ""
|
| 60 |
|
| 61 |
-
# Function to extract text from images using OCR
|
| 62 |
-
def extract_text_from_images(images):
|
| 63 |
-
ocr_text = ""
|
| 64 |
-
for image in images:
|
| 65 |
-
try:
|
| 66 |
-
ocr_text += pytesseract.image_to_string(image).strip() + "\n"
|
| 67 |
-
except Exception as e:
|
| 68 |
-
logging.error(f"Error in OCR: {e}")
|
| 69 |
-
return ocr_text.strip()
|
| 70 |
-
|
| 71 |
# Function to process extracted content (PDF, DOCX, etc.)
|
| 72 |
-
def process_content(file_data, file_type):
|
| 73 |
text = ""
|
| 74 |
images = []
|
| 75 |
if file_type == "pdf":
|
|
@@ -84,7 +84,7 @@ def process_content(file_data, file_type):
|
|
| 84 |
image = Image.open(file_data)
|
| 85 |
images.append(image)
|
| 86 |
|
| 87 |
-
ocr_text = extract_text_from_images(images)
|
| 88 |
return text + "\n" + ocr_text
|
| 89 |
|
| 90 |
# Function to process PDF content
|
|
@@ -174,6 +174,9 @@ instructor_name = st.sidebar.text_input("Enter Instructor Name", "Instructor Nam
|
|
| 174 |
class_name = st.sidebar.text_input("Enter Class Name", "Class Name")
|
| 175 |
institution_name = st.sidebar.text_input("Enter Institution Name", "Institution Name")
|
| 176 |
|
|
|
|
|
|
|
|
|
|
| 177 |
if uploaded_file:
|
| 178 |
# Clear session state when a new file is uploaded
|
| 179 |
if "uploaded_filename" in st.session_state and st.session_state.uploaded_filename != uploaded_file.name:
|
|
@@ -187,7 +190,7 @@ if uploaded_file:
|
|
| 187 |
if file_type not in ["pdf", "docx", "txt", "png", "jpg"]:
|
| 188 |
st.error("Unsupported file type. Please upload PDF, DOCX, TXT, or image files.")
|
| 189 |
else:
|
| 190 |
-
syllabus_text = process_content(uploaded_file, file_type)
|
| 191 |
st.session_state.syllabus_text = syllabus_text
|
| 192 |
|
| 193 |
# Preview of Syllabus
|
|
|
|
| 22 |
# OCR Configuration for Pytesseract
|
| 23 |
pytesseract.pytesseract.tesseract_cmd = r"/usr/bin/tesseract" # Adjust to your system's path
|
| 24 |
|
| 25 |
+
# Enhanced OCR with configurable language option
|
| 26 |
+
def extract_text_from_images(images, lang="eng"):
    """Run Tesseract OCR over each image and return the combined text.

    Args:
        images: Iterable of images accepted by
            ``pytesseract.image_to_string`` (callers in this file pass
            objects returned by ``Image.open``). ``None`` is treated as an
            empty collection.
        lang: Tesseract language code forwarded to pytesseract as ``lang``,
            e.g. ``"eng"``.

    Returns:
        The stripped OCR text of every image that was read successfully,
        joined with newlines and stripped as a whole. Returns ``""`` when
        there are no images or none could be read.
    """
    if images is None:
        return ""

    page_texts = []
    for image in images:
        try:
            page_texts.append(
                pytesseract.image_to_string(image, lang=lang).strip()
            )
        except Exception as e:
            # Best effort: one unreadable image must not discard the text
            # already recovered from the others. logging.exception keeps the
            # traceback so the failure can still be diagnosed.
            logging.exception("Error in OCR: %s", e)

    # Join once instead of growing a string with += inside the loop.
    return "\n".join(page_texts).strip()
|
| 34 |
+
|
| 35 |
# Function to extract text, images, tables, and formulas from PDF
|
| 36 |
def extract_pdf_data(pdf_path):
|
| 37 |
data = {"text": "", "tables": [], "images": []}
|
|
|
|
| 68 |
logging.error(f"Error extracting TXT content: {e}")
|
| 69 |
return ""
|
| 70 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
# Function to process extracted content (PDF, DOCX, etc.)
|
| 72 |
+
def process_content(file_data, file_type, lang="eng"):
|
| 73 |
text = ""
|
| 74 |
images = []
|
| 75 |
if file_type == "pdf":
|
|
|
|
| 84 |
image = Image.open(file_data)
|
| 85 |
images.append(image)
|
| 86 |
|
| 87 |
+
ocr_text = extract_text_from_images(images, lang)
|
| 88 |
return text + "\n" + ocr_text
|
| 89 |
|
| 90 |
# Function to process PDF content
|
|
|
|
| 174 |
class_name = st.sidebar.text_input("Enter Class Name", "Class Name")
|
| 175 |
institution_name = st.sidebar.text_input("Enter Institution Name", "Institution Name")
|
| 176 |
|
| 177 |
+
# Language Option for OCR
|
| 178 |
+
ocr_lang = st.sidebar.selectbox("Select OCR Language", ["eng", "spa", "fra", "deu", "ita"])
|
| 179 |
+
|
| 180 |
if uploaded_file:
|
| 181 |
# Clear session state when a new file is uploaded
|
| 182 |
if "uploaded_filename" in st.session_state and st.session_state.uploaded_filename != uploaded_file.name:
|
|
|
|
| 190 |
if file_type not in ["pdf", "docx", "txt", "png", "jpg"]:
|
| 191 |
st.error("Unsupported file type. Please upload PDF, DOCX, TXT, or image files.")
|
| 192 |
else:
|
| 193 |
+
syllabus_text = process_content(uploaded_file, file_type, lang=ocr_lang)
|
| 194 |
st.session_state.syllabus_text = syllabus_text
|
| 195 |
|
| 196 |
# Preview of Syllabus
|