# NOTE(review): removed non-code residue here (HuggingFace Spaces page chrome,
# file-size line, commit hashes, and a line-number gutter) that was accidentally
# captured along with the source. It was not valid Python and broke the file.
# ------------------------------------------------------------
# 1. Import libraries
# ------------------------------------------------------------
# OCR library to read text from images
import pytesseract
# (FOR WINDOWS USERS) explicitly set tesseract.exe location
# Change the path if Tesseract is installed somewhere else
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
# For image loading and manipulation
from PIL import Image
# Vector database for storing embeddings locally
import chromadb
# Local sentence embedding model
from sentence_transformers import SentenceTransformer
# Simple web UI framework
import gradio as gr
# Create unique IDs for storing sentences
import uuid
# ------------------------------------------------------------
# 2. Load local embedding model
# ------------------------------------------------------------
# This model converts text into vectors (numbers).
# We use a small, fast model — runs on CPU.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
# ------------------------------------------------------------
# 3. Create ChromaDB database (local by default)
# ------------------------------------------------------------
# SECURITY: the previous version hard-coded a Chroma Cloud API key, tenant,
# and database name in source. Never commit secrets — that key is exposed and
# must be rotated/revoked. Cloud credentials now come from environment
# variables; without them we fall back to the local in-process client,
# which matches the "local DB" intent of this section.
import os

_chroma_api_key = os.environ.get("CHROMA_API_KEY")
if _chroma_api_key:
    # Connect to Chroma Cloud using credentials supplied via the environment.
    client = chromadb.CloudClient(
        api_key=_chroma_api_key,
        tenant=os.environ.get("CHROMA_TENANT", ""),
        database=os.environ.get("CHROMA_DATABASE", ""),
    )
else:
    # Local, in-memory vector store — no credentials required.
    client = chromadb.Client()

# get_or_create avoids a crash on re-run: create_collection raises if a
# collection with the same name already exists.
collection = client.get_or_create_collection("image_rag_final2")
# ------------------------------------------------------------
# 4. Function: process image and extract text
# ------------------------------------------------------------
def process_image(image):
    """OCR an uploaded image and store each text line in the vector DB.

    Args:
        image: numpy array from the Gradio image component, or None when the
            user submits without uploading anything.

    Returns:
        A status string containing the extracted text, or an explanatory
        message when there is no image or no detectable text.
    """
    # Guard: Gradio passes None when nothing was uploaded — the original
    # crashed inside Image.fromarray here.
    if image is None:
        return "Please upload an image first."
    # Convert uploaded numpy array image into PIL format
    img = Image.fromarray(image)
    # Run OCR to extract text from image
    text = pytesseract.image_to_string(img)
    # If OCR found nothing (empty or whitespace-only result)
    if not text.strip():
        return "No text detected in image."
    # Split OCR text into separate non-empty lines/sentences
    sentences = [line.strip() for line in text.splitlines() if line.strip()]
    # Convert each sentence to a vector embedding
    embeddings = embedder.encode(sentences).tolist()
    # Generate a unique ID for each sentence
    ids = [str(uuid.uuid4()) for _ in sentences]
    # Store sentences & embeddings in the Chroma vector DB
    collection.add(documents=sentences, embeddings=embeddings, ids=ids)
    # Return extracted text so the user can see what was stored
    return "Image processed and stored. Extracted text:\n\n" + "\n".join(sentences)
# ------------------------------------------------------------
# 5. Function: answer questions based on stored image text
# ------------------------------------------------------------
def answer_question(question):
    """Return the stored sentence most similar to the user's question.

    Args:
        question: free-text question from the Gradio textbox (may be None
            or empty).

    Returns:
        The best-matching stored sentence, or a prompt/explanatory message
        when the question is blank or no image text has been stored yet.
    """
    # Guard: None or whitespace-only input
    if not question or not question.strip():
        return "Please enter a question."
    # Convert question into embedding vector
    query_embedding = embedder.encode([question]).tolist()
    # Search for the single most similar stored sentence
    results = collection.query(
        query_embeddings=query_embedding,
        n_results=1
    )
    # Chroma returns one (possibly empty) document list per query, i.e.
    # {"documents": [[]]} for an empty collection. The original only checked
    # the outer list — which is never empty — and then raised IndexError on
    # [0][0] when no image had been processed yet.
    docs = results.get("documents") or []
    if not docs or not docs[0]:
        return "No data yet. Upload an image first."
    # Get the best matching sentence
    best_sentence = docs[0][0]
    # Return answer
    return f"Answer (most relevant text):\n{best_sentence}"
# ------------------------------------------------------------
# 6. Build Gradio User Interface
# ------------------------------------------------------------
# Components for the two workflows: image upload / OCR display,
# and question entry / answer display.
uploaded_image = gr.Image(label="Upload Image")
extracted_text = gr.Textbox(label="Extracted / Stored Text")
user_question = gr.Textbox(label="Ask a question about the image")
answer_display = gr.Textbox(label="Answer")

# Tab 1: run OCR on an uploaded image and store its text.
ocr_tab = gr.Interface(
    fn=process_image,
    inputs=uploaded_image,
    outputs=extracted_text,
    title="Upload Image & Extract Text",
)

# Tab 2: similarity-search the stored text with a question.
qa_tab = gr.Interface(
    fn=answer_question,
    inputs=user_question,
    outputs=answer_display,
    title="Ask Question About Image",
)

# Combine both interfaces into a single tabbed app.
app = gr.TabbedInterface(
    [ocr_tab, qa_tab],
    tab_names=["Upload Image", "Ask Question"],
)

# Start the web app
app.launch()