Sayeem26s committed on
Commit
85a47a4
·
verified ·
1 Parent(s): 2de8174

Upload 6 files

Browse files
Files changed (6) hide show
  1. .env +4 -0
  2. .gitignore +68 -0
  3. README.md +151 -20
  4. app.py +72 -0
  5. ocr_utils.py +89 -0
  6. requirements.txt +21 -3
.env ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ GROQ_API_KEY=your_groq_api_key_here
2
+ GOOGLE_API_KEY=your_google_gemini_api_key_here
3
+
4
+ # (previous Google API key removed; never commit real credentials)
.gitignore ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Environment variables
2
+ .env
3
+ .env.*
4
+
5
+ # Virtual environments
6
+ venv/
7
+ env/
8
+ .venv/
9
+ .venv*/
10
+
11
+ # Byte-compiled files
12
+ __pycache__/
13
+ *.py[cod]
14
+
15
+ # Distribution / packaging
16
+ .Python
17
+ build/
18
+ develop-eggs/
19
+ dist/
20
+ downloads/
21
+ eggs/
22
+ .eggs/
23
+ lib/
24
+ lib64/
25
+ parts/
26
+ sdist/
27
+ var/
28
+ wheels/
29
+ *.egg-info/
30
+ .installed.cfg
31
+ *.egg
32
+
33
+ # Logs and databases
34
+ *.log
35
+ *.sqlite3
36
+ *.db
37
+
38
+ # IDE-specific files
39
+ .vscode/
40
+ .idea/
41
+ *.iml
42
+
43
+ # OS-specific files
44
+ .DS_Store
45
+ Thumbs.db
46
+ desktop.ini
47
+
48
+ # Test coverage
49
+ htmlcov/
50
+ .tox/
51
+ .nox/
52
+ .coverage
53
+ .coverage.*
54
+ .cache
55
+ .pytest_cache/
56
+ nosetests.xml
57
+ coverage.xml
58
+ *.cover
59
+ .hypothesis/
60
+
61
+ # Jupyter Notebook checkpoints
62
+ .ipynb_checkpoints/
63
+
64
+ # Local configuration files
65
+ *.local
66
+
67
+ # History
68
+ .history
README.md CHANGED
@@ -1,20 +1,151 @@
1
- ---
2
- title: SmartReceipt AI
3
- emoji: 🚀
4
- colorFrom: red
5
- colorTo: red
6
- sdk: docker
7
- app_port: 8501
8
- tags:
9
- - streamlit
10
- pinned: false
11
- short_description: Streamlit template space
12
- license: apache-2.0
13
- ---
14
-
15
- # Welcome to Streamlit!
16
-
17
- Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart:
18
-
19
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
20
- forums](https://discuss.streamlit.io).
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SmartReceipt AI
2
+
3
+ **SmartReceipt AI** is a multimodal receipt OCR extractor built with **Streamlit**, **Google Gemini (via LangChain)**, and **Groq Whisper** for audio transcription.
4
+ It allows users to upload receipt images or provide speech input and converts them into a **structured plain-text receipt format**, preserving store info, order details, items, totals, gratuity, footers, and optionally splitting bills among guests.
5
+
6
+ ---
7
+
8
+ ## Features
9
+
10
+ * Upload receipt images (`.jpg`, `.jpeg`, `.png`) or provide voice input for instructions.
11
+ * Transcribe speech into English using **Groq Whisper**.
12
+ * Extract **all visible text** from receipts using **Google Gemini multimodal model**.
13
+ * Convert unstructured OCR into a **receipt-style structured layout**.
14
+ * Preserve:
15
+
16
+ * Store details
17
+ * Order information (order #, table, party size, server, date/time)
18
+ * Items with quantity and price
19
+ * Subtotals, tax, TOTAL
20
+ * Extra sections (gratuity, discounts, payment method)
21
+ * Footer messages (e.g., “Thank you”, “Visit again”)
22
+ * **Split the bill** automatically when requested, supporting both numeric and word formats (`4`, `four`, `five persons`, `guest 3`, etc.).
23
+ * Chat-like interface with conversation memory and continuous input.
24
+ * Export extracted receipts to `.txt` files for easy use.
25
+
26
+ ---
27
+
28
+ ## Project Structure
29
+
30
+ ```
31
+ .
32
+ ├── app.py # Streamlit UI: upload, audio input, display, export
33
+ ├── ocr_utils.py # Gemini OCR + Groq Whisper transcription + split bill logic
34
+ ├── requirements.txt # Python dependencies
35
+ ├── .env # Environment variables (API keys)
36
+ └── README.md # Project documentation
37
+ ```
38
+
39
+ ---
40
+
41
+ ## Requirements
42
+
43
+ * Python 3.10 or higher
44
+ * Google Gemini API key (obtain from [https://aistudio.google.com/](https://aistudio.google.com/))
45
+ * Groq API key (for Whisper transcription)
46
+
47
+ ---
48
+
49
+ ## Installation
50
+
51
+ 1. Clone the repository:
52
+
53
+ ```bash
54
+ git clone https://github.com/your-username/receipt-ocr-bot.git
55
+ cd receipt-ocr-bot
56
+ ```
57
+
58
+ 2. Create and activate a virtual environment (recommended):
59
+
60
+ ```bash
61
+ python -m venv venv
62
+ source venv/bin/activate # Linux/Mac
63
+ venv\Scripts\activate # Windows
64
+ ```
65
+
66
+ 3. Install dependencies:
67
+
68
+ ```bash
69
+ pip install -r requirements.txt
70
+ ```
71
+
72
+ 4. Create a `.env` file in the project root and add your API keys:
73
+
74
+ ```
75
+ GOOGLE_API_KEY=your_google_gemini_api_key_here
76
+ GROQ_API_KEY=your_groq_api_key_here
77
+ ```
78
+
79
+ ---
80
+
81
+ ## Running the Application
82
+
83
+ Start the Streamlit app:
84
+
85
+ ```bash
86
+ streamlit run app.py
87
+ ```
88
+
89
+ The app will launch in your browser at:
90
+
91
+ ```
92
+ http://localhost:8501
93
+ ```
94
+
95
+ ---
96
+
97
+ ## Usage
98
+
99
+ 1. **Text or Voice Input**:
100
+
101
+ * Type your instructions (e.g., “Split the bill among 4”).
102
+ * Optionally, record speech using the mini recorder — the app will transcribe to English automatically.
103
+ 2. **Upload Receipt**:
104
+
105
+ * Upload a receipt image (`.jpg`, `.jpeg`, `.png`).
106
+ 3. **Process OCR**:
107
+
108
+ * Click **Analyze Receipt**.
109
+ * The app extracts all receipt details and formats them in a structured plain-text layout.
110
+ 4. **Split Bill (Optional)**:
111
+
112
+ * If the user requested a split in text/speech, the output automatically shows per-person amounts at the end of the receipt.
113
+ 5. **Download Result**:
114
+
115
+ * Use the **Download as TXT** button to export the structured receipt.
116
+
117
+ ---
118
+
119
+ ## Notes
120
+
121
+ * The system prompt is strictly tuned for **receipts only**.
122
+ * TOTAL amounts are always displayed in uppercase.
123
+ * Bill splitting supports **both numbers and words** (`4`, `four`, `three people`, `guest 2` etc.).
124
+ * Model output is **plain text**; no JSON or Markdown.
125
+ * If no receipt is detected, the model will return: `No receipt detected`.
126
+
127
+ ---
128
+
129
+ ## Production Workflow
130
+
131
+ 1. **Audio Input (Optional)** → Transcribed by **Groq Whisper** → Text prompt.
132
+ 2. **Receipt Image Upload** → OCR by **Google Gemini** → Raw text.
133
+ 3. **Structured Formatting** → Apply receipt layout rules and alignment.
134
+ 4. **Split Bill Logic** → Handled automatically by the system prompt when requested.
135
+ 5. **Display & Export** → Streamlit shows structured receipt + download option.
136
+
137
+ ---
138
+
139
+ ## Support
140
+
141
+ For issues, questions, or collaboration, contact:
142
+ **[syaeem26s@gmail.com](mailto:syaeem26s@gmail.com)**
143
+
144
+ ---
145
+
146
+ Planned improvements: a more production-ready `app.py` with:
147
+
148
+ * Clean UI
149
+ * Mini voice recorder + text input combined
150
+ * Auto split bill handled via system prompt
151
+ * Continuous session state for chat-like experience
app.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from PIL import Image
3
+ from ocr_utils import extract_receipt_text, extract_from_text, transcribe_audio
4
+ from streamlit_mic_recorder import mic_recorder
5
+ import tempfile
6
+ import os
7
+
8
# ------------------ Streamlit UI ------------------
# Streamlit re-executes this script top to bottom on every interaction, so
# anything that must survive a rerun is kept in st.session_state.
st.set_page_config(page_title="SmartReceipt AI", layout="centered")
st.title("SmartReceipt AI")
st.write("Provide your text or speech And upload a receipt image to extract structured plain-text.")

# Session state: seed every key once so later reads never hit a missing key.
if "user_text" not in st.session_state:
    st.session_state.user_text = ""
if "uploaded_image" not in st.session_state:
    st.session_state.uploaded_image = None
if "ocr_result" not in st.session_state:
    st.session_state.ocr_result = None

# ---------------- Input: User Text or Speech ----------------
st.subheader("Enter text or record speech")

# Text input field, pre-filled with the last typed or transcribed prompt.
st.session_state.user_text = st.text_area("Type your input here:", st.session_state.user_text, height=100)

# Mic recorder
audio = mic_recorder(
    start_prompt="Start Recording",
    stop_prompt="Stop Recording",
    just_once=True,
    use_container_width=True
)

if audio and "bytes" in audio:
    # transcribe_audio() re-opens the recording by path, so the temp file has
    # to outlive this `with` block; hence delete=False plus manual cleanup.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
        tmp_file.write(audio["bytes"])
        tmp_path = tmp_file.name

    try:
        transcribed_text = transcribe_audio(tmp_path)
    finally:
        # Remove the recording even when transcription raises; previously a
        # failed API call left the temp file behind on disk.
        os.remove(tmp_path)

    st.session_state.user_text = transcribed_text
    st.text_area("Transcribed Text:", transcribed_text, height=100)

# ---------------- Input: Receipt Image ----------------
uploaded_file = st.file_uploader("Upload a receipt (JPG/PNG)", type=["jpg", "jpeg", "png"])
if uploaded_file:
    st.session_state.uploaded_image = uploaded_file
    image = Image.open(uploaded_file)
    st.image(image, caption="Uploaded Receipt", width=400)

# ---------------- Run OCR ----------------
if st.button("Analyze Receipt"):
    if st.session_state.user_text.strip() and st.session_state.uploaded_image:
        with st.spinner("Processing..."):
            # Two model calls: first extract the receipt text from the image,
            # then re-run it together with the user's prompt.
            ocr_text = extract_receipt_text(st.session_state.uploaded_image)
            model_input_text = st.session_state.user_text
            final_result = extract_from_text(f"User Prompt: {model_input_text}\n\n{ocr_text}")
            st.session_state.ocr_result = final_result
    else:
        st.warning("Please provide both a user prompt (text or speech) and a receipt image.")

# ---------------- Show Result ----------------
if st.session_state.ocr_result:
    st.subheader("Extracted Receipt Text")
    st.text_area("OCR Result", st.session_state.ocr_result, height=400)
    st.download_button(
        "Download Receipt as TXT",
        data=st.session_state.ocr_result,
        file_name="receipt_output.txt",
        mime="text/plain"
    )
ocr_utils.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import os
3
+ from dotenv import load_dotenv
4
+ from langchain_google_genai import ChatGoogleGenerativeAI
5
+ from langchain.schema import HumanMessage, SystemMessage
6
+ from groq import Groq
7
+
8
# Read API credentials from the environment (populated from .env when present).
load_dotenv()
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")

# Gemini chat model shared by the extraction helpers in this module.
llm = ChatGoogleGenerativeAI(
    google_api_key=GOOGLE_API_KEY,
    model="gemini-2.5-pro",
    max_output_tokens=2048,
    temperature=0,
)

# Groq SDK client, used below for Whisper transcription.
groq_client = Groq(api_key=GROQ_API_KEY)

# System message sent with every Gemini request: receipt-only extraction rules
# plus the exact trailer format to append when a bill split is requested.
system_prompt = """
You are a strict OCR analyst specialized in receipts.

- Extract ALL text from the uploaded receipt image or provided transcription and represent the text exactly like the receipt (keep spacing/alignment).
- Do not remove or skip fields that exist on the receipt.
- Keep spacing aligned, totals right-justified.
- TOTAL must always be uppercase.
- If no receipt detected, reply: No receipt detected.

--- SPLIT BILL INSTRUCTION ---
If the user requests to split the bill (e.g., "split among 4", "divide bill in four", "split for five people", "guest 3", "3 persons", "two friends", etc.):
1. Accept both digits (1, 2, 3, 4, etc.) and words ("one", "two", "three", "four", etc.).
2. Extract the TOTAL from the receipt.
3. Divide TOTAL by the requested number of persons.
4. At the END of the receipt output, strictly append in this format:

---
Split Bill (N persons): X.XX each
---

Where N is the number of persons and X.XX is the per-person share.
If no split is requested, do not add anything.
"""
48
+
49
def _sniff_image_mime(data, fallback="image/png"):
    """Return the MIME type implied by the magic bytes of *data*, else *fallback*."""
    if data.startswith(b"\x89PNG\r\n\x1a\n"):
        return "image/png"
    if data.startswith(b"\xff\xd8\xff"):
        return "image/jpeg"
    return fallback


def extract_receipt_text(uploaded_file):
    """Convert an uploaded receipt image to structured text using Gemini.

    Args:
        uploaded_file: Upload object exposing ``getvalue()`` that returns the
            raw image bytes. If it also carries a string ``type`` attribute
            starting with ``image/``, that value is used as the MIME type when
            the bytes are not recognisably PNG or JPEG.

    Returns:
        The ``content`` of the model response.
    """
    img_bytes = uploaded_file.getvalue()

    # The data URI used to be labelled image/png unconditionally, which
    # mislabels JPEG uploads. Prefer the magic bytes, then the declared type,
    # and only then fall back to the old PNG default.
    declared_type = getattr(uploaded_file, "type", None)
    if isinstance(declared_type, str) and declared_type.startswith("image/"):
        fallback_mime = declared_type
    else:
        fallback_mime = "image/png"
    mime_type = _sniff_image_mime(img_bytes, fallback_mime)

    img_base64 = base64.b64encode(img_bytes).decode("utf-8")

    messages = [
        SystemMessage(content=system_prompt),
        HumanMessage(content=[
            {"type": "text", "text": "Extract the receipt text in structured plain text."},
            {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{img_base64}"}}
        ])
    ]
    response = llm.invoke(messages)
    return response.content
63
+
64
def extract_from_text(text_input: str):
    """Run *text_input* through Gemini under the receipt system prompt.

    The text is sent as a single human message after the shared system
    message; the ``content`` of the model's reply is returned.
    """
    conversation = [SystemMessage(content=system_prompt)]
    conversation.append(HumanMessage(content=text_input))
    return llm.invoke(conversation).content
72
+
73
def transcribe_audio(file_path: str) -> str:
    """Transcribe the audio file at *file_path* with Groq Whisper.

    The file is read fully into memory and submitted with ``language="en"``.
    The result's ``text`` attribute is returned when present; a dict result
    falls back to its ``"text"`` / ``"transcription"`` keys (or ``""``); any
    other result is returned via ``str()``.
    """
    with open(file_path, "rb") as audio_file:
        payload = audio_file.read()

    result = groq_client.audio.transcriptions.create(
        file=(file_path, payload),
        model="whisper-large-v3",
        response_format="verbose_json",
        language="en",  # Force transcription output in English
    )

    # Object-style response.
    if hasattr(result, "text"):
        return result.text

    # Anything that is neither an object with .text nor a dict.
    if not isinstance(result, dict):
        return str(result)

    # Dict-style response.
    return result.get("text") or result.get("transcription") or ""
requirements.txt CHANGED
@@ -1,3 +1,21 @@
1
- altair
2
- pandas
3
- streamlit
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # --- Core Streamlit app ---
2
+ streamlit
3
+ pillow
4
+ python-dotenv
5
+
6
+ # --- LangChain + Gemini ---
7
+ langchain
8
+ langchain-google-genai
9
+ google-generativeai
10
+
11
+ # --- Groq Whisper API ---
12
+ groq
13
+
14
+ # --- Audio Recording (choose ONE) ---
15
+ # For st_audiorec (GitHub install)
16
+ #git+https://github.com/stefanrmmr/streamlit_audio_recorder
17
+ # OR
18
+ streamlit-mic-recorder
19
+
20
+ # --- Helpers ---
21
+ tqdm