File size: 4,515 Bytes
0a778db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f77d199
0a778db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
# ------------------------------------------------------------
# 1. Import libraries
# ------------------------------------------------------------

# OCR library to read text from images
import pytesseract

# (FOR WINDOWS USERS) explicitly set tesseract.exe location.
# pytesseract shells out to the Tesseract binary, so this must point at a
# real install. Change the path if Tesseract is installed somewhere else.
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# For image loading and manipulation (numpy array -> PIL image for OCR)
from PIL import Image

# Vector database client for storing embeddings
import chromadb

# Local sentence embedding model
from sentence_transformers import SentenceTransformer

# Simple web UI framework
import gradio as gr

# Create unique IDs for stored sentences
import uuid


# ------------------------------------------------------------
# 2. Load local embedding model
# ------------------------------------------------------------

# This model converts text into fixed-size vectors for similarity search.
# all-MiniLM-L6-v2 is a small sentence-transformers model chosen so the
# script runs on CPU without a GPU. Loaded once at import time and shared
# by both process_image() and answer_question().
embedder = SentenceTransformer("all-MiniLM-L6-v2")


# ------------------------------------------------------------
# 3. Connect to ChromaDB (Chroma Cloud)
# ------------------------------------------------------------

import os

# NOTE: despite the original comment, this is NOT a local in-memory DB —
# CloudClient talks to Chroma Cloud over the network.
#
# SECURITY: the API key used to be hardcoded in this file. Read the
# credentials from the environment instead; the hardcoded fallbacks keep
# existing setups working, but the exposed key should be rotated and the
# fallbacks removed.
client = chromadb.CloudClient(
    api_key=os.environ.get(
        "CHROMA_API_KEY", "ck-3TKpYcZnQiMFRYMs5XPusnJjcwJ1DekHF5eAK6Eixg3i"
    ),
    tenant=os.environ.get("CHROMA_TENANT", "a8aa043d-7905-4da1-9937-197415021b8c"),
    database=os.environ.get("CHROMA_DATABASE", "TEST 1"),
)

# Create or access a collection (like a table in a DB).
# get_or_create_collection is idempotent: the original create_collection
# raised an error on every re-run once the collection already existed.
collection = client.get_or_create_collection("image_rag_final2")


# ------------------------------------------------------------
# 4. Function: process image and extract text
# ------------------------------------------------------------

def process_image(image):
    """OCR an uploaded image, embed each text line, and store it in ChromaDB.

    Parameters:
        image: numpy array from the Gradio image component, or None when the
            user clicked submit without uploading an image.

    Returns:
        A status string echoing the extracted text, or an error message when
        there is no image / no detectable text.
    """
    # Gradio passes None when no image was uploaded — Image.fromarray(None)
    # would crash, so bail out early with a friendly message.
    if image is None:
        return "No image provided."

    # Convert the uploaded numpy array into PIL format for pytesseract
    img = Image.fromarray(image)

    # Run OCR to extract text from the image
    text = pytesseract.image_to_string(img)

    # Split OCR output into non-empty lines/sentences
    sentences = [s.strip() for s in text.split("\n") if s.strip()]

    # Nothing readable in the image — also avoids calling encode([]) below
    if not sentences:
        return "No text detected in image."

    # Convert each sentence to a vector embedding
    embeddings = embedder.encode(sentences).tolist()

    # Generate a unique ID for each sentence (Chroma requires unique ids)
    ids = [str(uuid.uuid4()) for _ in sentences]

    # Store sentences & embeddings in the Chroma vector DB
    collection.add(
        documents=sentences,
        embeddings=embeddings,
        ids=ids
    )

    # Return the extracted text so the user can see what was stored
    return "Image processed and stored. Extracted text:\n\n" + "\n".join(sentences)


# ------------------------------------------------------------
# 5. Function: answer questions based on stored image text
# ------------------------------------------------------------

def answer_question(question):
    """Answer a question by retrieving the most similar stored OCR sentence.

    Parameters:
        question: free-text question typed by the user.

    Returns:
        The best-matching stored sentence, or a prompt/error message.
    """
    # Reject empty / whitespace-only input
    if question.strip() == "":
        return "Please enter a question."

    # Convert the question into an embedding vector (list of one embedding)
    query_embedding = embedder.encode([question]).tolist()

    # Search for the single most similar stored sentence
    results = collection.query(
        query_embeddings=query_embedding,
        n_results=1
    )

    # results["documents"] is a list-of-lists (one inner list per query).
    # With an empty collection Chroma returns [[]], which is truthy, so the
    # original `if not results["documents"]` check never fired and the
    # [0][0] index below raised IndexError. Check the inner list too.
    docs = results.get("documents") or []
    if not docs or not docs[0]:
        return "No data yet. Upload an image first."

    # Best matching sentence for the (single) query
    best_sentence = docs[0][0]

    return f"Answer (most relevant text):\n{best_sentence}"


# ------------------------------------------------------------
# 6. Build Gradio User Interface
# ------------------------------------------------------------

# Image upload component (delivers a numpy array to process_image)
image_input = gr.Image(label="Upload Image")

# Read-only textbox showing the OCR text that was extracted and stored
ocr_output = gr.Textbox(label="Extracted / Stored Text")

# Free-text question input for the retrieval tab
question_box = gr.Textbox(label="Ask a question about the image")

# Textbox showing the best-matching stored sentence
answer_box = gr.Textbox(label="Answer")


# Two tabs wrapping the two functions above:
# Tab 1: Upload Image & Extract Text  -> process_image
# Tab 2: Ask Question about Image     -> answer_question
app = gr.TabbedInterface(
    [
        gr.Interface(
            fn=process_image,
            inputs=image_input,
            outputs=ocr_output,
            title="Upload Image & Extract Text"
        ),
        gr.Interface(
            fn=answer_question,
            inputs=question_box,
            outputs=answer_box,
            title="Ask Question About Image"
        ),
    ],
    tab_names=["Upload Image", "Ask Question"]
)

# Start the web app. NOTE: this runs on import as a module-level side effect;
# launch() blocks and serves the UI locally by default.
app.launch()