File size: 4,515 Bytes
0a778db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f77d199
0a778db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
# ------------------------------------------------------------
# 1. Import libraries
# ------------------------------------------------------------

# OCR library to read text from images
import pytesseract

# (FOR WINDOWS USERS) explicitly set tesseract.exe location.
# pytesseract shells out to the Tesseract binary, so this must point at a
# real install. Change the path if Tesseract is installed somewhere else.
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# For image loading and manipulation (numpy array -> PIL image for OCR)
from PIL import Image

# Vector database client for storing embeddings
import chromadb

# Local sentence embedding model
from sentence_transformers import SentenceTransformer

# Simple web UI framework
import gradio as gr

# Create unique IDs for stored sentences
import uuid


# ------------------------------------------------------------
# 2. Load local embedding model
# ------------------------------------------------------------

# This model converts text into fixed-size vectors for similarity search.
# all-MiniLM-L6-v2 is a small sentence-transformers model chosen so the
# script runs on CPU without a GPU. Loaded once at import time and shared
# by both process_image() and answer_question().
embedder = SentenceTransformer("all-MiniLM-L6-v2")


# ------------------------------------------------------------
# 3. Connect to ChromaDB (Chroma Cloud)
# ------------------------------------------------------------

import os

# NOTE: despite the original comment, this is NOT a local in-memory DB —
# CloudClient talks to Chroma Cloud over the network.
#
# SECURITY: the API key used to be hardcoded in this file. Read the
# credentials from the environment instead; the hardcoded fallbacks keep
# existing setups working, but the exposed key should be rotated and the
# fallbacks removed.
client = chromadb.CloudClient(
    api_key=os.environ.get(
        "CHROMA_API_KEY", "ck-3TKpYcZnQiMFRYMs5XPusnJjcwJ1DekHF5eAK6Eixg3i"
    ),
    tenant=os.environ.get("CHROMA_TENANT", "a8aa043d-7905-4da1-9937-197415021b8c"),
    database=os.environ.get("CHROMA_DATABASE", "TEST 1"),
)

# Create or access a collection (like a table in a DB).
# get_or_create_collection is idempotent: the original create_collection
# raised an error on every re-run once the collection already existed.
collection = client.get_or_create_collection("image_rag_final2")


# ------------------------------------------------------------
# 4. Function: process image and extract text
# ------------------------------------------------------------

def process_image(image):
    """OCR an uploaded image, embed each text line, and store it in ChromaDB.

    Parameters:
        image: numpy array from the Gradio image component, or None when the
            user clicked submit without uploading an image.

    Returns:
        A status string echoing the extracted text, or an error message when
        there is no image / no detectable text.
    """
    # Gradio passes None when no image was uploaded — Image.fromarray(None)
    # would crash, so bail out early with a friendly message.
    if image is None:
        return "No image provided."

    # Convert the uploaded numpy array into PIL format for pytesseract
    img = Image.fromarray(image)

    # Run OCR to extract text from the image
    text = pytesseract.image_to_string(img)

    # Split OCR output into non-empty lines/sentences
    sentences = [s.strip() for s in text.split("\n") if s.strip()]

    # Nothing readable in the image — also avoids calling encode([]) below
    if not sentences:
        return "No text detected in image."

    # Convert each sentence to a vector embedding
    embeddings = embedder.encode(sentences).tolist()

    # Generate a unique ID for each sentence (Chroma requires unique ids)
    ids = [str(uuid.uuid4()) for _ in sentences]

    # Store sentences & embeddings in the Chroma vector DB
    collection.add(
        documents=sentences,
        embeddings=embeddings,
        ids=ids
    )

    # Return the extracted text so the user can see what was stored
    return "Image processed and stored. Extracted text:\n\n" + "\n".join(sentences)


# ------------------------------------------------------------
# 5. Function: answer questions based on stored image text
# ------------------------------------------------------------

def answer_question(question):
    """Answer a question by retrieving the most similar stored OCR sentence.

    Parameters:
        question: free-text question typed by the user.

    Returns:
        The best-matching stored sentence, or a prompt/error message.
    """
    # Reject empty / whitespace-only input
    if question.strip() == "":
        return "Please enter a question."

    # Convert the question into an embedding vector (list of one embedding)
    query_embedding = embedder.encode([question]).tolist()

    # Search for the single most similar stored sentence
    results = collection.query(
        query_embeddings=query_embedding,
        n_results=1
    )

    # results["documents"] is a list-of-lists (one inner list per query).
    # With an empty collection Chroma returns [[]], which is truthy, so the
    # original `if not results["documents"]` check never fired and the
    # [0][0] index below raised IndexError. Check the inner list too.
    docs = results.get("documents") or []
    if not docs or not docs[0]:
        return "No data yet. Upload an image first."

    # Best matching sentence for the (single) query
    best_sentence = docs[0][0]

    return f"Answer (most relevant text):\n{best_sentence}"


# ------------------------------------------------------------
# 6. Build Gradio User Interface
# ------------------------------------------------------------

# Image upload component (delivers a numpy array to process_image)
image_input = gr.Image(label="Upload Image")

# Read-only textbox showing the OCR text that was extracted and stored
ocr_output = gr.Textbox(label="Extracted / Stored Text")

# Free-text question input for the retrieval tab
question_box = gr.Textbox(label="Ask a question about the image")

# Textbox showing the best-matching stored sentence
answer_box = gr.Textbox(label="Answer")


# Two tabs wrapping the two functions above:
# Tab 1: Upload Image & Extract Text  -> process_image
# Tab 2: Ask Question about Image     -> answer_question
app = gr.TabbedInterface(
    [
        gr.Interface(
            fn=process_image,
            inputs=image_input,
            outputs=ocr_output,
            title="Upload Image & Extract Text"
        ),
        gr.Interface(
            fn=answer_question,
            inputs=question_box,
            outputs=answer_box,
            title="Ask Question About Image"
        ),
    ],
    tab_names=["Upload Image", "Ask Question"]
)

# Start the web app. NOTE: this runs on import as a module-level side effect;
# launch() blocks and serves the UI locally by default.
app.launch()