Thunder-rk committed on
Commit
3ba792f
·
verified ·
1 Parent(s): 592c814

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +83 -0
app.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

import streamlit as st
import torch
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index.core import ServiceContext, SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.prompts.prompts import SimpleInputPrompt
from llama_index.embeddings.langchain import LangchainEmbedding
from llama_index.llms.huggingface import HuggingFaceLLM

"""Streamlit app: upload PDFs, index them with llama-index, and answer a
user query against the indexed content using a HuggingFace LLM."""

# HuggingFace token for gated model downloads.
# BUG FIX: the original assigned the undefined name `Secret`, which raised
# NameError on startup. Read the token from the environment or Streamlit
# secrets instead; warn (don't crash) if it is missing.
if not os.environ.get("HF_TOKEN"):
    try:
        os.environ["HF_TOKEN"] = st.secrets["HF_TOKEN"]
    except (KeyError, FileNotFoundError):
        st.warning("HF_TOKEN is not configured; gated model downloads may fail.")

DOCS_DIR = "./docs"

st.title("PDF Data Extractor")

# File uploader for one or more PDFs.
uploaded_files = st.file_uploader(
    "Upload PDF files", type="pdf", accept_multiple_files=True
)

# Free-text query to run against the uploaded documents.
user_query = st.text_input("Enter your query")

if st.button("Extract Data"):
    if uploaded_files and user_query:
        # BUG FIX: ensure the docs directory exists before writing into it;
        # on a fresh deployment open(...) raised FileNotFoundError.
        os.makedirs(DOCS_DIR, exist_ok=True)

        # Persist uploads so SimpleDirectoryReader can pick them up.
        for uploaded_file in uploaded_files:
            with open(os.path.join(DOCS_DIR, uploaded_file.name), "wb") as f:
                f.write(uploaded_file.getbuffer())

        # Load all documents found in the docs directory.
        documents = SimpleDirectoryReader(DOCS_DIR).load_data()

        # System prompt steering the LLM toward table extraction.
        system_prompt = """
You are a data extractor. Your goal is to analyze the given PDF document and extract the table containing information relevant to the user query.
"""

        # Pass the user query through unchanged.
        query_wrapper_prompt = SimpleInputPrompt("{query_str}")

        # Initialize the LLM.
        # BUG FIX: the Hub repo id requires the "google/" org prefix;
        # the bare "gemma-1.1-2b-it" is not a valid model id.
        llm = HuggingFaceLLM(
            context_window=4096,
            max_new_tokens=256,
            # Deterministic decoding: temperature 0, sampling disabled.
            generate_kwargs={"temperature": 0.0, "do_sample": False},
            system_prompt=system_prompt,
            query_wrapper_prompt=query_wrapper_prompt,
            tokenizer_name="google/gemma-1.1-2b-it",
            model_name="google/gemma-1.1-2b-it",
            device_map="auto",
            # fp16 halves memory use vs. the default fp32 weights.
            model_kwargs={"torch_dtype": torch.float16},
        )

        st.write("LLM download successful")

        # Sentence-transformers embeddings wrapped for llama-index.
        embed_model = LangchainEmbedding(
            HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
        )

        # Service context bundling LLM + embeddings + chunking config.
        # NOTE(review): ServiceContext is deprecated in newer llama-index
        # releases in favor of Settings — kept here to match the pinned API.
        service_context = ServiceContext.from_defaults(
            chunk_size=1024,
            llm=llm,
            embed_model=embed_model,
        )

        st.write("Before Vector Index")
        index = VectorStoreIndex.from_documents(
            documents, service_context=service_context
        )
        st.write("After Vector Index")

        # Query the index and show the answer.
        query_engine = index.as_query_engine()
        response = query_engine.query(user_query)

        st.write("Generated Response:")
        st.write(response)
    else:
        st.error("Please upload PDF files and enter a query.")