Spaces:
Sleeping
Sleeping
import os
import tempfile

import faiss
import numpy as np
import pytesseract
import requests
import streamlit as st
from groq import Groq
from pdf2image import convert_from_path
# Set up the Groq client; reads GROQ_API_KEY from the environment
# (os.environ.get returns None when the variable is unset).
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
def extract_text_from_pdf(pdf_path):
    """OCR every page of the PDF at *pdf_path* and return the concatenated text.

    Renders each page to an image with pdf2image, then runs Tesseract OCR
    on each page in order; page texts are joined with no separator.
    """
    pages = convert_from_path(pdf_path)
    return "".join(pytesseract.image_to_string(page) for page in pages)
def create_chunks(text, chunk_size=200):
    """Split *text* into chunks of at most *chunk_size* whitespace-separated words.

    Words are re-joined with single spaces, so runs of whitespace in the
    input are normalized. An empty/whitespace-only *text* yields [].
    """
    words = text.split()
    result = []
    for start in range(0, len(words), chunk_size):
        result.append(" ".join(words[start:start + chunk_size]))
    return result
# Function to store chunks in FAISS (GPU when available, CPU otherwise)
def store_chunks_in_faiss(chunks, vector_dim=768):
    """Build a FAISS flat-L2 index over *chunks* and return it.

    Args:
        chunks: sequence of text chunks; only its length is used here.
        vector_dim: embedding dimensionality (default 768, as before).

    NOTE: no real embeddings are computed — random vectors stand in for
    demonstration purposes, exactly as in the original code.

    Bug fix: the original called faiss.StandardGpuResources() unconditionally,
    which raises on CPU-only faiss builds (faiss-cpu has no GPU symbols).
    The GPU move is now guarded by the same check is_gpu_available() uses.
    """
    index = faiss.IndexFlatL2(vector_dim)
    # Move the index to GPU 0 only when a GPU-enabled faiss build sees a GPU.
    if getattr(faiss, "get_num_gpus", lambda: 0)() > 0:
        res = faiss.StandardGpuResources()
        index = faiss.index_cpu_to_gpu(res, 0, index)
    # Dummy embeddings for demonstration — replace with a real embedding model.
    embeddings = np.random.rand(len(chunks), vector_dim).astype("float32")
    index.add(embeddings)
    return index
# Check if FAISS is using GPU
def is_gpu_available():
    """Return True when faiss reports at least one visible GPU."""
    gpu_count = faiss.get_num_gpus()
    return gpu_count > 0
# --- Streamlit app interface ---
st.title("PDF Content Chunking and Retrieval with FAISS-GPU")

# PDF upload
uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])

if uploaded_file:
    st.write("Processing the uploaded file...")
    # Persist the upload so pdf2image can read it from disk. A named temp
    # file replaces the original fixed "uploaded_file.pdf" in the CWD, which
    # collided between concurrent Streamlit sessions and was never removed.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
        tmp.write(uploaded_file.getbuffer())
        pdf_path = tmp.name
    try:
        # Extract text via OCR
        extracted_text = extract_text_from_pdf(pdf_path)
    finally:
        os.unlink(pdf_path)  # always clean up the temp file
    st.text_area("Extracted Text", extracted_text, height=200)

    # Chunk text
    st.write("Creating chunks...")
    chunks = create_chunks(extracted_text)
    st.write(f"Total chunks created: {len(chunks)}")

    # Store chunks in FAISS
    st.write("Storing chunks in FAISS...")
    index = store_chunks_in_faiss(chunks)
    if is_gpu_available():
        st.success("FAISS is using GPU resources!")
    else:
        st.warning("FAISS is running on CPU.")
    st.write("Chunks successfully stored in the FAISS index!")

    # Interaction with Groq (indentation reconstructed: nested under the
    # uploaded-file branch, the conventional Streamlit layout).
    user_input = st.text_input("Ask a question about the content:")
    if user_input:
        st.write("Sending query to Groq API...")
        # NOTE(review): the question is sent verbatim — retrieved chunks are
        # never injected into the prompt, so this is not yet true RAG.
        response = client.chat.completions.create(
            messages=[{"role": "user", "content": user_input}],
            model="llama-3.3-70b-versatile",
        )
        st.text_area("Groq API Response", response.choices[0].message.content, height=100)