Spaces:

Sayeem26s
/

SmartReceipt-AI

Running

App Files Files Community

SmartReceipt-AI / ocr_utils.py

Sayeem26s

Upload 6 files

85a47a4 verified 5 months ago

raw

history blame contribute delete

3.23 kB

	import base64
	import os
	from dotenv import load_dotenv
	from langchain_google_genai import ChatGoogleGenerativeAI
	from langchain.schema import HumanMessage, SystemMessage
	from groq import Groq

	# Read variables from a .env file into the environment, then look up the
	# two API keys this module needs (each is None when the variable is unset).
	load_dotenv()
	GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
	GROQ_API_KEY = os.environ.get("GROQ_API_KEY")

	# Shared Gemini chat model: temperature 0, replies capped at 2048 output tokens.
	llm = ChatGoogleGenerativeAI(
	    google_api_key=GOOGLE_API_KEY,
	    model="gemini-2.5-pro",
	    max_output_tokens=2048,
	    temperature=0,
	)

	# Shared Groq client; used by the Whisper transcription call in this module.
	groq_client = Groq(api_key=GROQ_API_KEY)

	# System prompt with strict splitting rules.
	# Passed as the SystemMessage content by both extract_receipt_text and
	# extract_from_text. It tells the model to:
	#   * reproduce the receipt text faithfully (spacing/alignment kept, no
	#     fields dropped, TOTAL in uppercase);
	#   * answer "No receipt detected." when there is no receipt;
	#   * append a "Split Bill (N persons): X.XX each" footer, fenced by "---"
	#     lines, only when the user asks to split the bill.
	# NOTE: this is runtime text sent to the model -- editing any character
	# changes what the model is asked to do.
	system_prompt = """
	You are a strict OCR analyst specialized in receipts.

	- Extract ALL text from the uploaded receipt image or provided transcription and represent the text exactly like the receipt (keep spacing/alignment).
	- Do not remove or skip fields that exist on the receipt.
	- Keep spacing aligned, totals right-justified.
	- TOTAL must always be uppercase.
	- If no receipt detected, reply: No receipt detected.

	--- SPLIT BILL INSTRUCTION ---
	If the user requests to split the bill (e.g., "split among 4", "divide bill in four", "split for five people", "guest 3", "3 persons", "two friends", etc.):
	1. Accept both digits (1, 2, 3, 4, etc.) and words ("one", "two", "three", "four", etc.).
	2. Extract the TOTAL from the receipt.
	3. Divide TOTAL by the requested number of persons.
	4. At the END of the receipt output, strictly append in this format:

	---
	Split Bill (N persons): X.XX each
	---

	Where N is the number of persons and X.XX is the per-person share.
	If no split is requested, do not add anything.
	"""

	# Leading byte signatures ("magic numbers") of common image formats.
	_IMAGE_SIGNATURES = (
	    (b"\x89PNG\r\n\x1a\n", "image/png"),
	    (b"\xff\xd8\xff", "image/jpeg"),
	    (b"GIF87a", "image/gif"),
	    (b"GIF89a", "image/gif"),
	)


	def _sniff_image_mime(img_bytes, default="image/png"):
	    """Return the MIME type implied by the first bytes of *img_bytes*.

	    Falls back to *default* when no known signature matches.
	    """
	    for signature, mime in _IMAGE_SIGNATURES:
	        if img_bytes.startswith(signature):
	            return mime
	    # WebP is a RIFF container: b"RIFF", a 4-byte size, then b"WEBP".
	    if img_bytes[:4] == b"RIFF" and img_bytes[8:12] == b"WEBP":
	        return "image/webp"
	    return default


	def extract_receipt_text(uploaded_file):
	    """Convert an uploaded receipt image to structured text using Gemini.

	    Args:
	        uploaded_file: Object whose ``getvalue()`` returns the raw image
	            bytes (presumably an upload widget's file object or an
	            ``io.BytesIO`` -- confirm against the caller).

	    Returns:
	        The ``content`` of the model's reply to the receipt prompt.
	    """
	    img_bytes = uploaded_file.getvalue()
	    img_base64 = base64.b64encode(img_bytes).decode("utf-8")

	    # Label the data URI with the image's real type instead of always
	    # claiming PNG; unrecognised data keeps the previous "image/png" label.
	    mime_type = _sniff_image_mime(img_bytes)
	    data_uri = f"data:{mime_type};base64,{img_base64}"

	    messages = [
	        SystemMessage(content=system_prompt),
	        HumanMessage(content=[
	            {"type": "text", "text": "Extract the receipt text in structured plain text."},
	            {"type": "image_url", "image_url": {"url": data_uri}},
	        ]),
	    ]
	    response = llm.invoke(messages)
	    return response.content

	def extract_from_text(text_input: str):
	    """Run raw text through Gemini under the receipt system prompt.

	    Args:
	        text_input: Text to process, e.g. an audio transcription or
	            manually entered input; sent as the human message.

	    Returns:
	        The ``content`` of the model's reply.
	    """
	    conversation = [SystemMessage(content=system_prompt)]
	    conversation.append(HumanMessage(content=text_input))
	    return llm.invoke(conversation).content

	def transcribe_audio(file_path: str) -> str:
	    """Transcribe an audio file with Groq's Whisper endpoint, forcing English.

	    Args:
	        file_path: Path of the audio file; it is read fully into memory
	            and also sent as the upload's file name.

	    Returns:
	        The transcription's ``text`` attribute when the response has one;
	        for a dict response, its "text" or "transcription" entry (or ""
	        when both are missing/empty); otherwise ``str()`` of the response.
	    """
	    with open(file_path, "rb") as audio_file:
	        audio_bytes = audio_file.read()

	    result = groq_client.audio.transcriptions.create(
	        language="en",  # Force transcription output in English
	        response_format="verbose_json",
	        model="whisper-large-v3",
	        file=(file_path, audio_bytes),
	    )

	    # Object-style response: read the attribute directly.
	    if hasattr(result, "text"):
	        return result.text
	    # Anything that is neither object-style nor a dict is stringified as-is.
	    if not isinstance(result, dict):
	        return str(result)
	    return result.get("text") or result.get("transcription") or ""