# Streamlit app: question-answering over a pre-configured Google Drive PDF
# (download -> extract text -> chunk -> embed -> FAISS similarity search).
import os
import re

import faiss
import numpy as np
import pdfplumber
import requests
import streamlit as st
from sentence_transformers import SentenceTransformer
| # Constants | |
| DOCUMENT_URL = "https://drive.google.com/file/d/1XvqA1OIssRs2gbmOtKFKj-02yQ5X2yg0/view?usp=sharing" | |
| CHUNK_SIZE = 500 | |
| # Function to download document | |
| def download_document(file_url): | |
| file_id = file_url.split("/d/")[1].split("/")[0] | |
| download_url = f"https://drive.google.com/uc?export=download&id={file_id}" | |
| response = requests.get(download_url) | |
| output = "document.pdf" | |
| with open(output, "wb") as f: | |
| f.write(response.content) | |
| return output | |
| # Extract text from PDF | |
| def extract_text_from_pdf(file_path): | |
| text = "" | |
| with pdfplumber.open(file_path) as pdf: | |
| for page in pdf.pages: | |
| text += page.extract_text() | |
| return text | |
| # Chunk text into smaller parts | |
| def chunk_text(text, chunk_size=CHUNK_SIZE): | |
| sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text) | |
| chunks, current_chunk = [], "" | |
| for sentence in sentences: | |
| if len(current_chunk) + len(sentence) < chunk_size: | |
| current_chunk += sentence + " " | |
| else: | |
| chunks.append(current_chunk.strip()) | |
| current_chunk = sentence + " " | |
| if current_chunk: | |
| chunks.append(current_chunk.strip()) | |
| return chunks | |
| # Vectorize and store in FAISS | |
| def create_faiss_index(chunks, model): | |
| embeddings = model.encode(chunks) | |
| dimension = embeddings.shape[1] | |
| index = faiss.IndexFlatL2(dimension) | |
| index.add(embeddings) | |
| return index, embeddings | |
| # Query FAISS index | |
| def query_faiss(query, index, chunks, model, k=5): | |
| query_embedding = model.encode([query]) | |
| distances, indices = index.search(query_embedding, k) | |
| return [chunks[i] for i in indices[0]] | |
| # Streamlit application | |
| def main(): | |
| st.title("Document-Based Query Application") | |
| st.write("This application uses a pre-configured document as the dataset for answering queries.") | |
| # Download and process the document | |
| st.write("Processing the pre-configured document...") | |
| document_path = download_document(DOCUMENT_URL) | |
| text = extract_text_from_pdf(document_path) | |
| chunks = chunk_text(text) | |
| # Create FAISS index | |
| st.write("Creating FAISS index...") | |
| embedding_model = SentenceTransformer('all-MiniLM-L6-v2') | |
| index, embeddings = create_faiss_index(chunks, embedding_model) | |
| st.success("Document processed and indexed!") | |
| # Query the database | |
| query = st.text_input("Enter your query") | |
| if query: | |
| st.write("Fetching relevant content from the document...") | |
| results = query_faiss(query, index, chunks, embedding_model) | |
| st.write("Top relevant chunks:") | |
| for i, result in enumerate(results): | |
| st.write(f"{i+1}. {result}") | |
| if __name__ == "__main__": | |
| main() | |