# NOTE(review): "Spaces: Sleeping" below was page-status residue from a
# Hugging Face Spaces copy/paste, not part of the program — kept as a comment
# so the file stays valid Python; safe to delete.
# ------------------------------------------------------------
# 1. Import libraries
# ------------------------------------------------------------
# Standard library
import uuid  # unique IDs for stored sentences

# Third-party
import chromadb  # vector database for storing embeddings
import gradio as gr  # simple web UI framework
import pytesseract  # OCR library to read text from images
from PIL import Image  # image loading and manipulation
from sentence_transformers import SentenceTransformer  # local embedding model

# (FOR WINDOWS USERS) explicitly set the tesseract.exe location.
# Change the path if Tesseract is installed somewhere else.
# NOTE(review): this hard-coded path only exists on Windows hosts — on
# Linux/macOS, Tesseract on PATH is found automatically and this line
# should be removed or guarded (e.g. with os.name == "nt"). Confirm target OS.
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
# ------------------------------------------------------------
# 2. Load local embedding model
# ------------------------------------------------------------
# This model converts text into vectors (numbers).
# We use a small, fast model that runs on CPU.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# ------------------------------------------------------------
# 3. Create local ChromaDB database
# ------------------------------------------------------------
# SECURITY FIX: the original committed a live Chroma Cloud API key directly
# in source code — credentials must never be hard-coded. An in-process local
# client also matches the "local ChromaDB database" intent stated above.
# To use Chroma Cloud instead, read the key/tenant/database from environment
# variables (e.g. os.environ["CHROMA_API_KEY"]).
client = chromadb.Client()

# Create or access a collection (like a table in a DB).
# get_or_create_collection avoids the DuplicateCollection error that
# create_collection raises when the script is re-run against an existing DB.
collection = client.get_or_create_collection("image_rag_final2")
# ------------------------------------------------------------
# 4. Function: process image and extract text
# ------------------------------------------------------------
def process_image(image):
    """OCR an uploaded image and store its sentences in the vector DB.

    Parameters
    ----------
    image : numpy.ndarray or None
        Image as delivered by the Gradio ``gr.Image`` component
        (``None`` when the user submits without uploading).

    Returns
    -------
    str
        Status message, including the extracted text on success.
    """
    # ROBUSTNESS FIX: Gradio passes None when no image was uploaded;
    # Image.fromarray(None) would raise an unhandled exception.
    if image is None:
        return "Please upload an image first."
    # Convert the uploaded numpy array into PIL format
    img = Image.fromarray(image)
    # Run OCR to extract text from the image
    text = pytesseract.image_to_string(img)
    # If no text was found
    if not text.strip():
        return "No text detected in image."
    # Split OCR text into separate lines/sentences
    sentences = [s.strip() for s in text.split("\n") if s.strip()]
    # Convert each sentence to a vector embedding
    embeddings = embedder.encode(sentences).tolist()
    # Generate a unique ID for each sentence
    ids = [str(uuid.uuid4()) for _ in sentences]
    # Store sentences & embeddings in the Chroma vector DB
    collection.add(documents=sentences, embeddings=embeddings, ids=ids)
    # Return the extracted text so the user can see it
    return "Image processed and stored. Extracted text:\n\n" + "\n".join(sentences)
# ------------------------------------------------------------
# 5. Function: answer questions based on stored image text
# ------------------------------------------------------------
def answer_question(question):
    """Return the stored sentence most semantically similar to *question*.

    Parameters
    ----------
    question : str
        Free-text question typed by the user.

    Returns
    -------
    str
        The best-matching stored sentence, or a prompt/status message.
    """
    # Ask the user to type something
    if question.strip() == "":
        return "Please enter a question."
    # Convert the question into an embedding vector
    query_embedding = embedder.encode([question]).tolist()
    # Search for the single most similar text in ChromaDB
    results = collection.query(query_embeddings=query_embedding, n_results=1)
    # BUG FIX: on an empty collection, Chroma returns {"documents": [[]]} —
    # a truthy outer list — so the original `if not results["documents"]`
    # guard passed and the [0][0] index below raised IndexError.
    # Check the inner per-query list as well.
    docs = results.get("documents")
    if not docs or not docs[0]:
        return "No data yet. Upload an image first."
    # Get the best-matching sentence
    best_sentence = docs[0][0]
    # Return the answer
    return f"Answer (most relevant text):\n{best_sentence}"
# ------------------------------------------------------------
# 6. Build Gradio User Interface
# ------------------------------------------------------------
# Two tabs, one Interface each:
#   Tab 1: upload an image, OCR it, and store the extracted text
#   Tab 2: ask a question about the stored text
upload_tab = gr.Interface(
    fn=process_image,
    inputs=gr.Image(label="Upload Image"),
    outputs=gr.Textbox(label="Extracted / Stored Text"),
    title="Upload Image & Extract Text",
)

question_tab = gr.Interface(
    fn=answer_question,
    inputs=gr.Textbox(label="Ask a question about the image"),
    outputs=gr.Textbox(label="Answer"),
    title="Ask Question About Image",
)

app = gr.TabbedInterface(
    [upload_tab, question_tab],
    tab_names=["Upload Image", "Ask Question"],
)

# Start the web app
app.launch()