Akshay Kumar BM committed on
Commit
0f0dcaa
·
unverified ·
0 Parent(s):

Add files via upload

Browse files
Files changed (1) hide show
  1. app.py +283 -0
app.py ADDED
@@ -0,0 +1,283 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import validators
2
+ import streamlit as st
3
+ from langchain.prompts import PromptTemplate
4
+ from langchain_groq import ChatGroq
5
+ from langchain.chains.summarize import load_summarize_chain
6
+ from langchain_community.document_loaders import YoutubeLoader, UnstructuredURLLoader, PyPDFLoader, TextLoader
7
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
8
+ from langchain.schema import Document
9
+ import os
10
+ import tempfile
11
+ from dotenv import load_dotenv
12
+ import tiktoken
13
+ load_dotenv()
14
+
15
+ os.environ['LANGCHAIN_API_KEY']=os.getenv("LANGCHAIN_API_KEY")
16
+ os.environ['LANGCHAIN_TRACING_V2']="true"
17
+ os.environ['LANGCHAIN_PROJECT']="LangChain: Process Content from Multiple Sources"
18
+ os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
19
+
20
st.set_page_config(page_title="LangChain: Process Content from Multiple Sources", page_icon="🦜")
st.title("🦜 LangChain: Process Content from Multiple Sources")

# Keep one slider range per uploaded PDF alive across Streamlit reruns.
if 'pdf_page_ranges' not in st.session_state:
    st.session_state['pdf_page_ranges'] = {}
26
+
27
# Function to count tokens
def count_tokens(text, model="gpt-3.5-turbo"):
    """Return the number of tokens `text` encodes to for `model`.

    The Groq-hosted models used by this app are not in tiktoken's model
    registry, so the GPT-3.5 tokenizer serves only as an approximation for
    cost estimation.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # Unknown model name (e.g. a Groq model id such as "llama3-8b-8192"):
        # fall back to the cl100k_base BPE instead of crashing the app.
        encoding = tiktoken.get_encoding("cl100k_base")
    return len(encoding.encode(text))
31
+
32
# Function to estimate cost
def estimate_cost(total_tokens, model):
    """Return the estimated USD cost of processing `total_tokens` tokens.

    Rates are USD per 1K tokens for each Groq model offered in the sidebar;
    an unknown model id raises KeyError.
    """
    per_1k_usd = {
        "llama3-8b-8192": 0.05,
        "gemma2-9b-it": 0.05,
        "mixtral-8x7b-32768": 0.10,
    }
    rate = per_1k_usd[model]
    return total_tokens / 1000 * rate
40
+
41
# Function to calculate optimal chunk size
def calculate_chunk_size(text_length, model_context_length):
    """Choose a chunk size (in characters) for splitting source text.

    Targets roughly a third of the model context window, capped at half the
    context and at the length of the text itself, with a 1000-character floor.

    Args:
        text_length: Total number of characters that will be split.
        model_context_length: Context window size of the target model.

    Returns:
        Chunk size in characters, always >= 1000.
    """
    target_chunk_size = model_context_length // 3
    # Fix: honor text_length (previously accepted but ignored) — a chunk never
    # needs to exceed the text itself. The 1000-char floor keeps the result
    # unchanged for every realistically sized input.
    return max(1000, min(target_chunk_size, model_context_length // 2, text_length))
45
+
46
# Sidebar: API key, model choice, and (populated later) PDF page settings.
with st.sidebar:
    st.header("Configuration")
    groq_api_key = st.text_input("Groq API Key", type="password")
    model = st.selectbox(
        "Select Model",
        ["llama3-8b-8192", "gemma2-9b-it", "mixtral-8x7b-32768"],
    )

    st.header("PDF Settings")
53
+
54
# Main content: let the user pick which source types to process.
st.subheader('Select Sources to Process')
use_urls = st.checkbox("URLs (YouTube or websites)")
use_files = st.checkbox("File Upload (PDF or text files)")
use_text = st.checkbox("Text Input")

# Each enabled source type stores its raw input under a well-known key.
sources = {}

if use_urls:
    sources['urls'] = st.text_area(
        "Enter URLs (one per line)",
        placeholder="https://example.com\nhttps://youtube.com/watch?v=...",
    )
64
+
65
if use_files:
    uploaded_files = st.file_uploader("Upload PDF or text files", type=["pdf", "txt"], accept_multiple_files=True)
    if uploaded_files:
        sources['files'] = uploaded_files

        # PDF page range selection: load each PDF once just to count pages,
        # then offer a per-file range slider in the sidebar.
        for uploaded_file in uploaded_files:
            if uploaded_file.type == "application/pdf":
                # PyPDFLoader needs a real path, so spill the upload to disk.
                with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
                    temp_file.write(uploaded_file.getvalue())
                    temp_file_path = temp_file.name

                try:
                    loader = PyPDFLoader(temp_file_path)
                    pdf_pages = loader.load()
                    total_pages = len(pdf_pages)
                finally:
                    # Fix: always remove the temp copy, even when PDF parsing
                    # raises — previously a parse failure leaked the file.
                    os.unlink(temp_file_path)

                # Use the file name as a unique key for session state.
                file_key = f"pdf_range_{uploaded_file.name}"

                # Initialize the range in session state if it doesn't exist.
                if file_key not in st.session_state.pdf_page_ranges:
                    st.session_state.pdf_page_ranges[file_key] = (1, total_pages)

                with st.sidebar:
                    st.write(f"PDF: {uploaded_file.name}")
                    st.write(f"Total pages: {total_pages}")
                    if total_pages > 1:
                        page_range = st.slider(
                            f"Select page range for {uploaded_file.name}",
                            1, total_pages,
                            value=st.session_state.pdf_page_ranges[file_key],
                            key=file_key
                        )
                        st.session_state.pdf_page_ranges[file_key] = page_range
                    else:
                        st.write("This PDF has only one page.")
                        st.session_state.pdf_page_ranges[file_key] = (1, 1)
105
+
106
if use_text:
    # Free-form pasted text becomes its own source entry.
    sources['text'] = st.text_area("Enter text content", placeholder="Paste your text here...")
108
+
109
# Predefined actions the user can apply to the gathered content.
predefined_actions = [
    "Summarize", "Analyze", "Review", "Critique", "Explain",
    "Paraphrase", "Simplify", "Elaborate", "Extract key points",
    "Provide an overview", "Highlight main ideas", "Create an outline",
    "Generate a report", "Identify themes", "List pros and cons",
    "Fact-check", "Create study notes", "Generate questions"
]

# Action selection: a canned verb or a free-form custom instruction.
action_type = st.radio("Choose action type", ["Predefined", "Custom"])

action = (
    st.selectbox("Select Action", predefined_actions)
    if action_type == "Predefined"
    else st.text_input("Enter Custom Action", placeholder="e.g., Summarize in bullet points")
)
125
+
126
# Templates for the "refine" summarization chain: the first chunk goes through
# prompt_template, every later chunk is folded in via refine_template.
prompt_template = """
Provide a {action} of the following content:

Content: {text}

{action}:
"""

refine_template = """
We have provided an existing {action} of the content: {existing_answer}

We have some additional content to incorporate: {text}

Given this new information, please refine and update the existing {action}.

Refined {action}:
"""

# from_template infers the input variables ({text}, {action}, {existing_answer})
# directly from the template strings.
prompt = PromptTemplate.from_template(prompt_template)
refine_prompt = PromptTemplate.from_template(refine_template)
147
+
148
# Process button: gather all selected sources into documents, estimate the
# token cost, then run a refine-style chain with the chosen action.
if st.button("Process Content"):
    if not groq_api_key.strip():
        st.error("Please provide your Groq API key in the sidebar.")
    elif not sources:
        st.error("Please select at least one source type and provide content.")
    elif action_type == "Custom" and not action.strip():
        st.error("Please enter a custom action.")
    else:
        try:
            llm = ChatGroq(model=model, groq_api_key=groq_api_key)

            all_docs = []
            total_tokens = 0

            with st.spinner(f"Processing... ({action.lower()})"):
                # --- URL sources ---
                if 'urls' in sources and sources['urls']:
                    url_list = [url.strip() for url in sources['urls'].split('\n') if url.strip()]
                    for url in url_list:
                        if not validators.url(url):
                            st.warning(f"Skipping invalid URL: {url}")
                            continue

                        if "youtube.com" in url or "youtu.be" in url:
                            loader = YoutubeLoader.from_youtube_url(url, add_video_info=True)
                            st.info(f"Processing YouTube video: {url}")
                        else:
                            loader = UnstructuredURLLoader(
                                urls=[url],
                                ssl_verify=False,
                                headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_5_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"}
                            )
                            st.info(f"Processing website content: {url}")

                        docs = loader.load()
                        all_docs.extend(docs)

                # --- Uploaded file sources ---
                if 'files' in sources and sources['files']:
                    for uploaded_file in sources['files']:
                        # Loaders need a real path; spill the upload to disk.
                        with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(uploaded_file.name)[1]) as temp_file:
                            temp_file.write(uploaded_file.getvalue())
                            temp_file_path = temp_file.name

                        if uploaded_file.type == "application/pdf":
                            loader = PyPDFLoader(temp_file_path)
                            st.info(f"Processing PDF: {uploaded_file.name}")

                            pdf_pages = loader.load()
                            file_key = f"pdf_range_{uploaded_file.name}"
                            page_range = st.session_state.pdf_page_ranges[file_key]

                            # Slider values are 1-based inclusive; slicing is
                            # 0-based half-open.
                            selected_pages = pdf_pages[page_range[0]-1:page_range[1]]

                            # Greedily pack whole pages into chunks no larger
                            # than chunk_size characters.
                            chunk_size = calculate_chunk_size(sum(len(page.page_content) for page in selected_pages), 8192)
                            current_chunk = []
                            current_chunk_size = 0

                            for page in selected_pages:
                                page_size = len(page.page_content)
                                if current_chunk_size + page_size > chunk_size and current_chunk:
                                    all_docs.append(Document(page_content="\n".join([p.page_content for p in current_chunk]), metadata={"source": uploaded_file.name}))
                                    current_chunk = []
                                    current_chunk_size = 0
                                current_chunk.append(page)
                                current_chunk_size += page_size

                            if current_chunk:
                                all_docs.append(Document(page_content="\n".join([p.page_content for p in current_chunk]), metadata={"source": uploaded_file.name}))
                        else:
                            loader = TextLoader(temp_file_path)
                            st.info(f"Processing text file: {uploaded_file.name}")
                            docs = loader.load()
                            all_docs.extend(docs)

                        os.unlink(temp_file_path)

                # --- Pasted text source ---
                if 'text' in sources and sources['text']:
                    with tempfile.NamedTemporaryFile(delete=False, mode="w", suffix=".txt") as temp_file:
                        temp_file.write(sources['text'])
                        temp_file_path = temp_file.name

                    loader = TextLoader(temp_file_path)
                    docs = loader.load()
                    all_docs.extend(docs)
                    st.info("Processing text input")

                    os.unlink(temp_file_path)

            if not all_docs:
                st.error("No content was processed. Please check your inputs and try again.")
            else:
                # Fix: everything below is now guarded by `else` — previously
                # execution fell through after the error above and ran the
                # chain on zero documents (and computed a negative
                # refine-template token count from len(split_docs) - 1).

                # PDF docs were already packed to size above; split the rest.
                text_splitter = RecursiveCharacterTextSplitter(chunk_size=calculate_chunk_size(sum(len(doc.page_content) for doc in all_docs), 8192), chunk_overlap=200)
                split_docs = []
                for doc in all_docs:
                    if doc.metadata.get("source", "").lower().endswith(".pdf"):
                        split_docs.append(doc)
                    else:
                        split_docs.extend(text_splitter.split_documents([doc]))

                for doc in split_docs:
                    total_tokens += count_tokens(doc.page_content)

                # Rough prompt overhead: one question prompt per chunk plus
                # one refine prompt per chunk after the first.
                total_tokens += count_tokens(prompt_template) * len(split_docs)
                total_tokens += count_tokens(refine_template) * (len(split_docs) - 1)

                estimated_cost = estimate_cost(total_tokens, model)

                st.info(f"Estimated cost for processing: ${estimated_cost:.4f}")

                chain = load_summarize_chain(
                    llm=llm,
                    chain_type="refine",
                    question_prompt=prompt,
                    refine_prompt=refine_prompt
                )

                output = chain.run(input_documents=split_docs, action=action.lower())

                output_tokens = count_tokens(output)
                total_tokens += output_tokens

                final_cost = estimate_cost(total_tokens, model)

                st.success("Processing complete!")
                st.subheader(f"{action} Result")
                st.write(output)

                st.info(f"Total tokens processed: {total_tokens}")
                st.info(f"Final cost for processing: ${final_cost:.4f}")

        except Exception as e:
            st.error(f"An error occurred: {str(e)}")
281
+
282
# Static footer rendered on every run.
st.divider()
st.caption("Powered by LangChain and Groq")