DrishtiSharma commited on
Commit
818f4ab
·
verified ·
1 Parent(s): 512b711

Delete test.py

Browse files
Files changed (1) hide show
  1. test.py +0 -332
test.py DELETED
@@ -1,332 +0,0 @@
1
- # to-do: make it multilingual
2
- import streamlit as st
3
- import os
4
- from openai import OpenAI
5
- import tempfile
6
- from langchain.chains import ConversationalRetrievalChain
7
- from langchain_openai import ChatOpenAI, OpenAIEmbeddings
8
- from langchain.text_splitter import RecursiveCharacterTextSplitter
9
- from langchain_community.vectorstores import Chroma
10
- from langchain_community.document_loaders import (
11
- PyPDFLoader,
12
- TextLoader,
13
- CSVLoader
14
- )
15
- from datetime import datetime
16
- from pydub import AudioSegment
17
- import pytz
18
-
19
- from langchain.chains import ConversationalRetrievalChain
20
- from langchain.text_splitter import RecursiveCharacterTextSplitter
21
- from langchain_openai import ChatOpenAI, OpenAIEmbeddings
22
- from langchain_community.vectorstores import Chroma
23
- from langchain_community.document_loaders import PyPDFLoader, TextLoader, CSVLoader
24
- import os
25
- import tempfile
26
- from datetime import datetime
27
- import pytz
28
-
29
-
30
class DocumentRAG:
    """RAG pipeline over user-uploaded documents.

    Responsibilities: load and chunk PDF/TXT/CSV files, index them in a
    persistent Chroma vector store, summarize them, answer questions via a
    ConversationalRetrievalChain, and turn the summary into a two-host
    audio podcast (OpenAI chat + TTS, stitched with pydub).

    Requires the OPENAI_API_KEY environment variable.

    Fixes vs. previous revision:
    - removed an identical duplicate definition of generate_summary that
      silently shadowed the first one;
    - temporary files created with delete=False are now cleaned up instead
      of being leaked.
    """

    def __init__(self):
        # Vector store and QA chain are built lazily by process_documents().
        self.document_store = None
        self.qa_chain = None
        self.document_summary = ""
        self.chat_history = []
        self.last_processed_time = None
        self.api_key = os.getenv("OPENAI_API_KEY")  # fetched from environment
        self.init_time = datetime.now(pytz.UTC)

        if not self.api_key:
            raise ValueError("API Key not found. Make sure to set the 'OPENAI_API_KEY' environment variable.")

        # Persistent directory for Chroma to avoid tenant-related errors.
        self.chroma_persist_dir = "./chroma_storage"
        os.makedirs(self.chroma_persist_dir, exist_ok=True)

    def process_documents(self, uploaded_files):
        """Save uploads to temp files, index them, and build the QA chain.

        Supports .pdf, .txt and .csv. Returns a human-readable status string;
        the UI detects success by the word "successfully" in the message.
        """
        if not self.api_key:
            return "Please set the OpenAI API key in the environment variables."
        if not uploaded_files:
            return "Please upload documents first."

        temp_paths = []  # track temp copies so they can be removed afterwards
        try:
            documents = []
            for uploaded_file in uploaded_files:
                # The langchain loaders need a real path, so spill each upload
                # to disk with its original extension preserved.
                suffix = os.path.splitext(uploaded_file.name)[1]
                temp_file_path = tempfile.NamedTemporaryFile(delete=False, suffix=suffix).name
                temp_paths.append(temp_file_path)
                with open(temp_file_path, "wb") as temp_file:
                    temp_file.write(uploaded_file.read())

                # Pick a loader from the file extension.
                if temp_file_path.endswith('.pdf'):
                    loader = PyPDFLoader(temp_file_path)
                elif temp_file_path.endswith('.txt'):
                    loader = TextLoader(temp_file_path)
                elif temp_file_path.endswith('.csv'):
                    loader = CSVLoader(temp_file_path)
                else:
                    return f"Unsupported file type: {uploaded_file.name}"

                try:
                    documents.extend(loader.load())
                except Exception as e:
                    return f"Error loading {uploaded_file.name}: {str(e)}"

            if not documents:
                return "No valid documents were processed. Please check your files."

            # Chunk the text so retrieval operates on passage-sized pieces.
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=1000,
                chunk_overlap=200,
                length_function=len
            )
            documents = text_splitter.split_documents(documents)

            # Summarize up front for the UI and the podcast step.
            combined_text = " ".join([doc.page_content for doc in documents])
            self.document_summary = self.generate_summary(combined_text)

            # Embed and index the chunks in the persistent Chroma store.
            embeddings = OpenAIEmbeddings(api_key=self.api_key)
            self.document_store = Chroma.from_documents(
                documents,
                embeddings,
                persist_directory=self.chroma_persist_dir  # persistent directory for Chroma
            )

            self.qa_chain = ConversationalRetrievalChain.from_llm(
                ChatOpenAI(temperature=0, model_name='gpt-4', api_key=self.api_key),
                self.document_store.as_retriever(search_kwargs={'k': 6}),
                return_source_documents=True,
                verbose=False
            )

            self.last_processed_time = datetime.now(pytz.UTC)
            return "Documents processed successfully!"
        except Exception as e:
            return f"Error processing documents: {str(e)}"
        finally:
            # Remove the temporary copies regardless of outcome
            # (they were previously leaked).
            for path in temp_paths:
                try:
                    os.unlink(path)
                except OSError:
                    pass

    def generate_summary(self, text):
        """Summarize *text* (truncated to 4000 chars) with 3-5 key points.

        Returns the summary string, or an error message string on failure.
        """
        if not self.api_key:
            return "API Key not set. Please set it in the environment variables."
        try:
            client = OpenAI(api_key=self.api_key)
            response = client.chat.completions.create(
                model="gpt-4",
                messages=[
                    {"role": "system", "content": "Summarize the document content concisely and provide 3-5 key points for discussion."},
                    # Truncate to keep the request within the model's context.
                    {"role": "user", "content": text[:4000]}
                ],
                temperature=0.3
            )
            return response.choices[0].message.content
        except Exception as e:
            return f"Error generating summary: {str(e)}"

    def create_podcast(self):
        """Generate a two-host podcast script and MP3 from the summary.

        Returns (script_text, audio_file_path); on failure the first element
        is an error message and the second is None.
        """
        if not self.document_summary:
            return "Please process documents before generating a podcast.", None

        if not self.api_key:
            return "Please set the OpenAI API key in the environment variables.", None

        try:
            client = OpenAI(api_key=self.api_key)

            # Ask the chat model for a short two-host dialogue script.
            script_response = client.chat.completions.create(
                model="gpt-4",
                messages=[
                    {"role": "system", "content": "You are a professional podcast producer. Create a natural dialogue based on the provided document summary."},
                    {"role": "user", "content": f"""Based on the following document summary, create a 1-2 minute podcast script:
                    1. Clearly label the dialogue as 'Host 1:' and 'Host 2:'
                    2. Keep the content engaging and insightful.
                    3. Use conversational language suitable for a podcast.
                    4. Ensure the script has a clear opening and closing.
                    Document Summary: {self.document_summary}"""}
                ],
                temperature=0.7
            )

            script = script_response.choices[0].message.content
            if not script:
                return "Error: Failed to generate podcast script.", None

            # Synthesize each dialogue line, alternating TTS voices so the
            # two hosts sound distinct.
            final_audio = AudioSegment.empty()
            is_first_speaker = True

            lines = [line.strip() for line in script.split("\n") if line.strip()]
            for line in lines:
                if ":" not in line:
                    continue  # skip non-dialogue lines

                speaker, text = line.split(":", 1)
                if not text.strip():
                    continue

                try:
                    voice = "nova" if is_first_speaker else "onyx"
                    audio_response = client.audio.speech.create(
                        model="tts-1",
                        voice=voice,
                        input=text.strip()
                    )

                    temp_audio_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
                    audio_response.stream_to_file(temp_audio_file.name)

                    segment = AudioSegment.from_file(temp_audio_file.name)
                    final_audio += segment
                    final_audio += AudioSegment.silent(duration=300)  # pause between turns

                    # The per-line temp file has been read into memory;
                    # remove it (it was previously leaked).
                    try:
                        os.unlink(temp_audio_file.name)
                    except OSError:
                        pass

                    is_first_speaker = not is_first_speaker
                except Exception as e:
                    # Best effort: skip lines whose TTS fails rather than abort.
                    print(f"Error generating audio for line: {text}")
                    print(f"Details: {e}")
                    continue

            if len(final_audio) == 0:
                return "Error: No audio could be generated.", None

            # Export the stitched audio; the path is handed to the UI for playback,
            # so this file is intentionally left on disk.
            output_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3").name
            final_audio.export(output_file, format="mp3")
            return script, output_file

        except Exception as e:
            return f"Error generating podcast: {str(e)}", None

    def handle_query(self, question, history):
        """Answer *question* against the indexed documents.

        Appends (question, answer) to *history* and returns it; errors are
        reported as ("System", message) entries instead of raising.
        """
        if not self.qa_chain:
            return history + [("System", "Please process the documents first.")]
        try:
            # Steer the model toward short, professional English answers.
            preface = """
            Instruction: Respond in English. Be professional and concise, keeping the response under 300 words.
            If you cannot provide an answer, say: "I am not sure about this question. Please try asking something else."
            """
            query = f"{preface}\nQuery: {question}"

            result = self.qa_chain({
                "question": query,
                "chat_history": [(q, a) for q, a in history]
            })

            if "answer" not in result:
                return history + [("System", "Sorry, an error occurred.")]

            history.append((question, result["answer"]))
            return history
        except Exception as e:
            return history + [("System", f"Error: {str(e)}")]
246
-
247
# Create the RAG backend once per browser session; Streamlit reruns the
# script on every interaction, so the instance must live in session_state.
if "rag_system" not in st.session_state:
    st.session_state.rag_system = DocumentRAG()
250
-
251
# Sidebar: app description plus a numbered usage walkthrough.
with st.sidebar:
    st.title("About")
    st.markdown(
        """
        This app is inspired by the [RAG_HW HuggingFace Space](https://huggingface.co/spaces/wint543/RAG_HW).
        It allows users to upload documents, generate summaries, ask questions, and create podcasts.
        """
    )
    st.markdown("### Steps:")
    for step_line in (
        "1. Upload documents.",
        "2. Generate summaries.",
        "3. Ask questions.",
        "4. Create podcasts.",
    ):
        st.markdown(step_line)
265
-
266
# Main app header. (A fully commented-out duplicate of the sidebar block
# previously sat here; it was dead code and has been removed.)
st.title("Document Analyzer and Podcast Generator")
283
-
284
# Step 1: collect source files and build the retrieval index.
st.subheader("Step 1: Upload and Process Documents")
uploaded_files = st.file_uploader("Upload files (PDF, TXT, CSV)", accept_multiple_files=True)

if st.button("Process Documents"):
    if not uploaded_files:
        st.warning("No files uploaded.")
    else:
        status = st.session_state.rag_system.process_documents(uploaded_files)
        # process_documents signals success via its message text.
        (st.success if "successfully" in status else st.error)(status)
298
-
299
# Step 2: display the summary produced during document processing.
st.subheader("Step 2: Generate Summaries")
summary = st.session_state.rag_system.document_summary
if summary:
    st.text_area("Document Summary", summary, height=200)
else:
    st.info("Please process documents first to generate summaries.")
305
-
306
# Step 3: interactive Q&A over the indexed documents.
st.subheader("Step 3: Ask Questions")
if st.session_state.rag_system.qa_chain:
    # Bug fix: history used to be reset to [] on every Streamlit rerun, so the
    # retrieval chain never saw prior turns and earlier exchanges disappeared
    # from the UI. Persist the conversation in session_state instead.
    if "qa_history" not in st.session_state:
        st.session_state.qa_history = []

    user_question = st.text_input("Ask a question:")
    if st.button("Submit Question"):
        st.session_state.qa_history = st.session_state.rag_system.handle_query(
            user_question, st.session_state.qa_history
        )

    # Render the full conversation on every rerun, not just after a submit.
    for question, answer in st.session_state.qa_history:
        st.chat_message("user").write(question)
        st.chat_message("assistant").write(answer)
else:
    st.info("Please process documents first to enable Q&A.")
319
-
320
# Step 4: turn the document summary into a two-host audio podcast.
st.subheader("Step 4: Generate Podcast")
if not st.session_state.rag_system.document_summary:
    st.info("Please process documents and generate summaries before creating a podcast.")
elif st.button("Generate Podcast"):
    script, audio_path = st.session_state.rag_system.create_podcast()
    if audio_path:
        st.text_area("Generated Podcast Script", script, height=200)
        st.audio(audio_path, format="audio/mp3")
        st.success("Podcast generated successfully! You can listen to it above.")
    else:
        # On failure create_podcast returns the error message as the script.
        st.error(script)