Thunder-rk committed on
Commit
3ba792f
·
verified ·
1 Parent(s): 592c814

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +83 -0
app.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

import streamlit as st
import torch
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index.core import ServiceContext, SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.prompts.prompts import SimpleInputPrompt
from llama_index.embeddings.langchain import LangchainEmbedding
from llama_index.llms.huggingface import HuggingFaceLLM

"""Streamlit app: upload PDFs, index them with llama-index, and answer a
user query against the indexed content using a HuggingFace LLM."""

# HuggingFace token for gated model downloads.
# BUG FIX: the original assigned the undefined name `Secret`, which raised
# NameError on startup. Read the token from the environment or Streamlit
# secrets instead; warn (don't crash) if it is missing.
if not os.environ.get("HF_TOKEN"):
    try:
        os.environ["HF_TOKEN"] = st.secrets["HF_TOKEN"]
    except (KeyError, FileNotFoundError):
        st.warning("HF_TOKEN is not configured; gated model downloads may fail.")

DOCS_DIR = "./docs"

st.title("PDF Data Extractor")

# File uploader for one or more PDFs.
uploaded_files = st.file_uploader(
    "Upload PDF files", type="pdf", accept_multiple_files=True
)

# Free-text query to run against the uploaded documents.
user_query = st.text_input("Enter your query")

if st.button("Extract Data"):
    if uploaded_files and user_query:
        # BUG FIX: ensure the docs directory exists before writing into it;
        # on a fresh deployment open(...) raised FileNotFoundError.
        os.makedirs(DOCS_DIR, exist_ok=True)

        # Persist uploads so SimpleDirectoryReader can pick them up.
        for uploaded_file in uploaded_files:
            with open(os.path.join(DOCS_DIR, uploaded_file.name), "wb") as f:
                f.write(uploaded_file.getbuffer())

        # Load all documents found in the docs directory.
        documents = SimpleDirectoryReader(DOCS_DIR).load_data()

        # System prompt steering the LLM toward table extraction.
        system_prompt = """
You are a data extractor. Your goal is to analyze the given PDF document and extract the table containing information relevant to the user query.
"""

        # Pass the user query through unchanged.
        query_wrapper_prompt = SimpleInputPrompt("{query_str}")

        # Initialize the LLM.
        # BUG FIX: the Hub repo id requires the "google/" org prefix;
        # the bare "gemma-1.1-2b-it" is not a valid model id.
        llm = HuggingFaceLLM(
            context_window=4096,
            max_new_tokens=256,
            # Deterministic decoding: temperature 0, sampling disabled.
            generate_kwargs={"temperature": 0.0, "do_sample": False},
            system_prompt=system_prompt,
            query_wrapper_prompt=query_wrapper_prompt,
            tokenizer_name="google/gemma-1.1-2b-it",
            model_name="google/gemma-1.1-2b-it",
            device_map="auto",
            # fp16 halves memory use vs. the default fp32 weights.
            model_kwargs={"torch_dtype": torch.float16},
        )

        st.write("LLM download successful")

        # Sentence-transformers embeddings wrapped for llama-index.
        embed_model = LangchainEmbedding(
            HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
        )

        # Service context bundling LLM + embeddings + chunking config.
        # NOTE(review): ServiceContext is deprecated in newer llama-index
        # releases in favor of Settings — kept here to match the pinned API.
        service_context = ServiceContext.from_defaults(
            chunk_size=1024,
            llm=llm,
            embed_model=embed_model,
        )

        st.write("Before Vector Index")
        index = VectorStoreIndex.from_documents(
            documents, service_context=service_context
        )
        st.write("After Vector Index")

        # Query the index and show the answer.
        query_engine = index.as_query_engine()
        response = query_engine.query(user_query)

        st.write("Generated Response:")
        st.write(response)
    else:
        st.error("Please upload PDF files and enter a query.")