Spaces:

PRIYANSHUDHAKED
/

Data_Extraction_OCR

Sleeping

App Files Files Community

Data_Extraction_OCR / app.py

PRIYANSHUDHAKED

Update app.py

d1a52ca verified over 1 year ago

raw

history blame contribute delete

5.61 kB

	import os
	import google.generativeai as genai
	from PIL import Image
	import io
	import streamlit as st
	import re

	# Google Gemini API key.
	# The key is a secret and must not live in source control, so it is read from
	# the GOOGLE_API_KEY environment variable (for example a Space/deployment secret).
	# os.getenv takes the *name* of the variable, not the secret value itself.
	# NOTE(review): any key that was ever committed to this file is public and
	# should be revoked and rotated.
	GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

	# Configure Google Gemini with the key taken from the environment
	genai.configure(api_key=GOOGLE_API_KEY)

	# Create a GenerativeModel instance
	model = genai.GenerativeModel("gemini-1.5-flash")

	def extract_text_with_gemini(image, keyword=None):
	    """Ask the Gemini model to transcribe the text shown in ``image``.

	    Args:
	        image: The image object sent to the model together with the prompt.
	        keyword: Optional search term. When truthy, the prompt asks the model
	            for HTML output with every occurrence of the term wrapped in a
	            yellow-background ``<span>``.

	    Returns:
	        The model's reply text. With a keyword the reply is returned as-is;
	        without one, anything that looks like an HTML tag is removed first.
	    """
	    if not keyword:
	        wants_highlight = False
	        prompt = """
	Extract all text from this image. Provide the output as plain text, maintaining the general layout and structure of the document. Include all visible text, headings, and any important information.
	"""
	    else:
	        wants_highlight = True
	        prompt = f"""
	1. Extract all text from this image.
	2. Search for the keyword '{keyword}' (case-insensitive) in the extracted text.
	3. Provide the output as HTML, maintaining the general layout and structure of the document.
	4. Highlight all instances of the keyword '{keyword}' with a yellow background using HTML span tags.
	For example: <span style="background-color: yellow;">keyword</span>
	5. If the keyword is not found, simply return the extracted text without highlighting.
	"""

	    reply = model.generate_content([prompt, image]).text

	    if wants_highlight:
	        # The highlight markup is the whole point of this mode, so keep it.
	        return reply

	    # Plain-text mode: drop any HTML tags the model emitted anyway.
	    return re.sub(r'<[^>]+>', '', reply)

	def extract_ner_with_gemini(image):
	    """Ask the Gemini model to list the named entities found in ``image``.

	    Args:
	        image: The image object sent to the model together with the prompt.

	    Returns:
	        The model's reply text; the prompt requests a formatted list of
	        entities grouped by category.
	    """
	    ner_prompt = """
	Analyze this image and extract all Named Entities (NER) present in the text.
	Categorize them into types such as Person, Organization, Location, Date, etc.
	Provide the output as a formatted list with categories and entities.
	"""

	    return model.generate_content([ner_prompt, image]).text

	def search_and_highlight(full_text, keyword):
	    """Find ``keyword`` in ``full_text`` (case-insensitive) and highlight it.

	    Args:
	        full_text: The text to search.
	        keyword: The literal term to look for; regex metacharacters in it are
	            escaped, so it is never interpreted as a pattern.

	    Returns:
	        A ``(results, highlighted_text)`` tuple. ``results`` holds one snippet
	        per occurrence, in document order, made of up to 50 characters of
	        context on each side with the match wrapped in a yellow-background
	        ``<span>``. ``highlighted_text`` is ``full_text`` with every match
	        wrapped the same way. When nothing matches (or ``keyword`` is empty)
	        the result is ``([], full_text)``.
	    """
	    # An empty pattern matches at every position, which would wrap an empty
	    # span between every character; treat it as "no matches" instead.
	    if not keyword:
	        return [], full_text

	    pattern = re.compile(re.escape(keyword), re.IGNORECASE)
	    span_open = '<span style="background-color: yellow;">'
	    span_close = '</span>'
	    context_width = 50

	    results = []
	    pieces = []
	    cursor = 0
	    for match in pattern.finditer(full_text):
	        start, end = match.span()
	        marked = f'{span_open}{match.group()}{span_close}'

	        # Snippet for the results list: the match plus surrounding context.
	        before = full_text[max(0, start - context_width):start]
	        after = full_text[end:end + context_width]
	        results.append(before + marked + after)

	        # Build the full highlighted text in one left-to-right pass so earlier
	        # insertions never shift the offsets of later matches.
	        pieces.append(full_text[cursor:start])
	        pieces.append(marked)
	        cursor = end

	    pieces.append(full_text[cursor:])
	    return results, "".join(pieces)

	def app():
	    """Render the Streamlit page for OCR, keyword search and NER extraction.

	    The page shows a file uploader; once an image is supplied it displays the
	    image, a choice of search method, a keyword box, and two buttons side by
	    side: one runs text extraction (with optional keyword highlighting) and
	    the other runs named-entity extraction.
	    """
	    st.title("Image OCR, Search, and NER Extraction")
	    uploaded_file = st.file_uploader("Upload an image", type=["png", "jpg", "jpeg"])

	    # Nothing further is rendered until an image has been uploaded.
	    if uploaded_file is None:
	        return

	    # Open and display the image
	    image = Image.open(uploaded_file)
	    st.image(image, caption="Uploaded Image", use_column_width=True)

	    # The two search strategies offered to the user.
	    local_search = "Extract text first, then search"
	    gemini_search = "Search while extracting text (using Gemini API)"
	    search_method = st.radio("Choose search method:", (local_search, gemini_search))

	    search_keyword = st.text_input("Enter a keyword to search (or press Enter to exit)")

	    ocr_column, ner_column = st.columns(2)

	    with ocr_column:
	        if st.button("Process Image"):
	            if search_method != local_search:
	                # Let the model extract the text and highlight the keyword itself.
	                print("Extracting text and searching keyword using Gemini API...")
	                model_highlighted = extract_text_with_gemini(image, search_keyword)
	                st.subheader("Extracted Text with Highlighted Keyword:")
	                st.markdown(model_highlighted, unsafe_allow_html=True)
	            else:
	                # Extract plain text first, then search it locally.
	                print("Extracting text from the image...")
	                extracted_text = extract_text_with_gemini(image)
	                st.subheader("Extracted Text:")
	                st.write(extracted_text)

	                if search_keyword:
	                    snippets, marked_text = search_and_highlight(extracted_text, search_keyword)
	                    if not snippets:
	                        st.write(f"Keyword '{search_keyword}' not found in the extracted text.")
	                    else:
	                        st.subheader(f"Keyword '{search_keyword}' found in the extracted text:")
	                        for number, snippet in enumerate(snippets, 1):
	                            st.markdown(f"{number}. ...{snippet}...", unsafe_allow_html=True)

	                        st.subheader("Full Text with Highlighted Keywords:")
	                        st.markdown(marked_text, unsafe_allow_html=True)

	            st.write("OCR and search completed.")

	    with ner_column:
	        if st.button("Extract NER"):
	            print("Extracting Named Entities...")
	            entities = extract_ner_with_gemini(image)
	            st.subheader("Named Entities Extracted:")
	            st.write(entities)

	# Build the page only when this file is executed as the entry-point script.
	if __name__ == "__main__":
	    app()