# ------------------------------------------------------------
# 1. Import libraries
# ------------------------------------------------------------
import os    # read configuration / secrets from the environment
import uuid  # unique IDs for stored sentences

import chromadb                      # vector database client
import gradio as gr                  # simple web UI framework
import pytesseract                   # OCR: read text from images
from PIL import Image                # image loading and manipulation
from sentence_transformers import SentenceTransformer  # local embeddings

# (FOR WINDOWS USERS) explicitly set tesseract.exe location.
# Override with the TESSERACT_CMD environment variable if Tesseract
# is installed somewhere else.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    "TESSERACT_CMD",
    r"C:\Program Files\Tesseract-OCR\tesseract.exe",
)

# ------------------------------------------------------------
# 2. Load local embedding model
# ------------------------------------------------------------
# Converts text into dense vectors; small and fast enough for CPU.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# ------------------------------------------------------------
# 3. Connect to ChromaDB
# ------------------------------------------------------------
# SECURITY: never hard-code API keys or tenant IDs in source code.
# Credentials are read from the environment; the key previously
# committed here must be revoked and rotated.
client = chromadb.CloudClient(
    api_key=os.environ["CHROMA_API_KEY"],   # required, no default
    tenant=os.environ["CHROMA_TENANT"],     # required, no default
    database=os.environ.get("CHROMA_DATABASE", "TEST 1"),
)

# get_or_create_collection avoids the exception create_collection
# raises when the collection already exists (e.g. on a second run).
collection = client.get_or_create_collection("image_rag_final2")


# ------------------------------------------------------------
# 4. Function: process image and extract text
# ------------------------------------------------------------
def process_image(image):
    """OCR an uploaded image, embed each text line, and store it in Chroma.

    Args:
        image: numpy array supplied by the Gradio image component, or
            None when the user submits without uploading anything.

    Returns:
        A status string for the UI, including the extracted text on success.
    """
    # Guard: Gradio passes None if no image was uploaded;
    # Image.fromarray(None) would raise.
    if image is None:
        return "Please upload an image first."

    # pytesseract expects a PIL image, not a raw numpy array.
    img = Image.fromarray(image)
    text = pytesseract.image_to_string(img)

    if not text.strip():
        return "No text detected in image."

    # One stored "document" per non-empty OCR line.
    sentences = [s.strip() for s in text.split("\n") if s.strip()]

    # Vector embedding per sentence, plus a unique ID for each.
    embeddings = embedder.encode(sentences).tolist()
    ids = [str(uuid.uuid4()) for _ in sentences]

    collection.add(documents=sentences, embeddings=embeddings, ids=ids)

    # Echo the stored text back so the user can see what was captured.
    return "Image processed and stored. Extracted text:\n\n" + "\n".join(sentences)


# ------------------------------------------------------------
# 5. Function: answer questions based on stored image text
# ------------------------------------------------------------
def answer_question(question):
    """Return the stored sentence most similar to *question*.

    Args:
        question: free-text query typed by the user.

    Returns:
        A string with the best-matching stored sentence, or a prompt when
        there is no question / no stored data yet.
    """
    if not question.strip():
        return "Please enter a question."

    # Embed the question and fetch the single nearest stored sentence.
    query_embedding = embedder.encode([question]).tolist()
    results = collection.query(query_embeddings=query_embedding, n_results=1)

    # Chroma returns a list-of-lists (one inner list per query). An empty
    # store yields [[]], which is truthy — so the inner list must be
    # checked too, otherwise the "no data" branch never fires.
    docs = results.get("documents") or []
    if not docs or not docs[0]:
        return "No data yet. Upload an image first."

    best_sentence = docs[0][0]
    return f"Answer (most relevant text):\n{best_sentence}"


# ------------------------------------------------------------
# 6. Build Gradio User Interface
# ------------------------------------------------------------
image_input = gr.Image(label="Upload Image")
ocr_output = gr.Textbox(label="Extracted / Stored Text")
question_box = gr.Textbox(label="Ask a question about the image")
answer_box = gr.Textbox(label="Answer")

# Two tabs:
#   Tab 1: upload an image and extract/store its text.
#   Tab 2: ask a question answered from the stored text.
app = gr.TabbedInterface(
    [
        gr.Interface(
            fn=process_image,
            inputs=image_input,
            outputs=ocr_output,
            title="Upload Image & Extract Text",
        ),
        gr.Interface(
            fn=answer_question,
            inputs=question_box,
            outputs=answer_box,
            title="Ask Question About Image",
        ),
    ],
    tab_names=["Upload Image", "Ask Question"],
)

# Start the web app only when executed as a script (not on import).
if __name__ == "__main__":
    app.launch()