# SmartReceipt-AI / ocr_utils.py
# Sayeem26s's picture
# Upload 6 files
# 85a47a4 verified
import base64
import os
from dotenv import load_dotenv
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.schema import HumanMessage, SystemMessage
from groq import Groq
# Read variables from a local .env file (if any), then look up both API keys.
# Either key is None when its variable is not set.
load_dotenv()
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")

# Gemini chat model that receives the receipt prompts.
llm = ChatGoogleGenerativeAI(
    google_api_key=GOOGLE_API_KEY,
    model="gemini-2.5-pro",
    max_output_tokens=2048,
    temperature=0,
)

# Groq client; it backs the Whisper audio transcription call.
groq_client = Groq(api_key=GROQ_API_KEY)
# System prompt used as the SystemMessage content for the Gemini requests.
# It instructs the model to reproduce the receipt text with its layout and,
# only when the user asks for a split, to append a "Split Bill" block at the
# end of the output. The literal is sent to the model verbatim, so any edit
# to its wording changes model behavior.
system_prompt = """
You are a strict OCR analyst specialized in receipts.
- Extract ALL text from the uploaded receipt image or provided transcription and represent the text exactly like the receipt (keep spacing/alignment).
- Do not remove or skip fields that exist on the receipt.
- Keep spacing aligned, totals right-justified.
- TOTAL must always be uppercase.
- If no receipt detected, reply: No receipt detected.
--- SPLIT BILL INSTRUCTION ---
If the user requests to split the bill (e.g., "split among 4", "divide bill in four", "split for five people", "guest 3", "3 persons", "two friends", etc.):
1. Accept both digits (1, 2, 3, 4, etc.) and words ("one", "two", "three", "four", etc.).
2. Extract the TOTAL from the receipt.
3. Divide TOTAL by the requested number of persons.
4. At the END of the receipt output, strictly append in this format:
---
Split Bill (N persons): X.XX each
---
Where N is the number of persons and X.XX is the per-person share.
If no split is requested, do not add anything.
"""
def extract_receipt_text(uploaded_file):
    """Convert an uploaded receipt image to structured text using Gemini.

    Args:
        uploaded_file: Object exposing ``getvalue()`` that returns the raw
            image bytes. If it also has a ``type`` attribute holding a MIME
            type (presumably a Streamlit ``UploadedFile`` -- confirm against
            the caller), that type labels the image sent to the model.

    Returns:
        The ``content`` of the model response, or ``"No receipt detected."``
        when the upload contains no bytes.
    """
    img_bytes = uploaded_file.getvalue()
    if not img_bytes:
        # Nothing to OCR: answer the way the system prompt tells the model
        # to, without spending a model call on an empty payload.
        return "No receipt detected."

    # Label the payload with its real MIME type instead of always claiming
    # PNG. Anything unknown or missing falls back to the previous behavior.
    supported_types = {
        "image/png",
        "image/jpeg",
        "image/webp",
        "image/heic",
        "image/heif",
    }
    mime_type = getattr(uploaded_file, "type", None)
    if mime_type not in supported_types:
        mime_type = "image/png"

    img_base64 = base64.b64encode(img_bytes).decode("utf-8")
    data_url = f"data:{mime_type};base64,{img_base64}"

    messages = [
        SystemMessage(content=system_prompt),
        HumanMessage(content=[
            {"type": "text", "text": "Extract the receipt text in structured plain text."},
            {"type": "image_url", "image_url": {"url": data_url}},
        ]),
    ]
    response = llm.invoke(messages)
    return response.content
def extract_from_text(text_input: str):
    """Send raw text (a transcription or manual input) through the Gemini receipt prompt.

    Args:
        text_input: Receipt text or a spoken/typed request to analyse.

    Returns:
        The ``content`` of the model response, or ``"No receipt detected."``
        when ``text_input`` is empty or contains only whitespace.
    """
    if not text_input or not text_input.strip():
        # An empty prompt carries no receipt to analyse, so skip the model
        # round-trip and reply the way the system prompt prescribes.
        return "No receipt detected."

    messages = [
        SystemMessage(content=system_prompt),
        HumanMessage(content=text_input),
    ]
    response = llm.invoke(messages)
    return response.content
def transcribe_audio(file_path: str) -> str:
    """Return the English transcription of an audio file via Groq Whisper.

    Args:
        file_path: Path of the audio file; it is read in binary mode and
            also sent as the upload's file name.

    Returns:
        The ``text`` attribute of the response when present; for a dict
        response, its ``"text"`` or ``"transcription"`` entry (empty string
        if neither is set); otherwise the response converted with ``str``.
    """
    with open(file_path, "rb") as audio_file:
        audio_bytes = audio_file.read()

    result = groq_client.audio.transcriptions.create(
        file=(file_path, audio_bytes),
        model="whisper-large-v3",
        response_format="verbose_json",
        language="en",  # Force transcription output in English
    )

    # Response objects expose the transcript as an attribute.
    if hasattr(result, "text"):
        return result.text
    # Anything that is neither such an object nor a dict is stringified.
    if not isinstance(result, dict):
        return str(result)
    return result.get("text") or result.get("transcription") or ""