ChatWithYourPDF

Runtime error

App Files Community

ToddLLM committed on Mar 18, 2024

Commit

a44b0d7

1 Parent(s): 95b96e9

fix pdf handling

Browse files

Files changed (1) hide show

app.py +42 -48

app.py CHANGED Viewed

@@ -46,72 +46,65 @@ prompt = ChatPromptTemplate.from_messages(messages)
 chain_type_kwargs = {"prompt": prompt}
-def process_file(file: cl.AskFileMessage):
-    import tempfile
-    with tempfile.NamedTemporaryFile(mode="w", delete=False) as tempfile:
-        with open(tempfile.name, "wb") as f:
-            f.write(file.content)
-    pypdf_loader = PyPDFLoader(tempfile.name)
-    texts = pypdf_loader.load_and_split()
-    texts = [text.page_content for text in texts]
     return texts
 @cl.on_chat_start
 async def on_chat_start():
-    files = None
-    # Wait for the user to upload a file
-    while files is None:
-        # Note: This now accepts both text/plain and application/pdf files
         files = await cl.AskFileMessage(
             content="Please upload a text or PDF file to begin!",
-            accept=["text/plain", "application/pdf"],
-            max_size_mb=20,  # Assuming PDFs might be larger
             timeout=180,
         ).send()
-    file = files[0]
-    # Notify the user that their file is being processed
-    msg = cl.Message(content=f"Processing `{file.name}`...")
-    await msg.send()
-    # Initialize an empty list for texts, this will be populated based on file type
     texts = []
-    # Check the file type and process accordingly
-    if file.content_type == "text/plain":
-        # Handle text file
         with open(file.path, "r", encoding="utf-8") as f:
             text = f.read()
-        texts.append(text)  # Add the text to the texts list
-        # Update the user about the text file
-        await cl.Message(
-            content=f"`{file.name}` uploaded, it contains {len(text)} characters!"
-        ).send()
-    elif file.content_type == "application/pdf":
-        # Handle PDF file
-        texts = process_file(file)  # Assuming process_file() is a function you've defined to extract text from PDF
-        # Create metadata for each chunk
-        metadatas = [{"source": f"{i}-pl"} for i in range(len(texts))]
-        # Create a Chroma vector store
-        embeddings = OpenAIEmbeddings()
-        docsearch = await cl.make_async(Chroma.from_texts)(
-            texts, embeddings, metadatas=metadatas
-        )
-    # The rest of your setup, like creating the chain, goes here
-    # This part is unchanged from your second snippet
     message_history = ChatMessageHistory()
     memory = ConversationBufferMemory(
         memory_key="chat_history",
         output_key="answer",
@@ -119,6 +112,7 @@ async def on_chat_start():
         return_messages=True,
     )
     chain = ConversationalRetrievalChain.from_llm(
         ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0, streaming=True),
         chain_type="stuff",
@@ -128,9 +122,9 @@ async def on_chat_start():
     )
     # Let the user know that the system is ready
-    msg.content = f"Processing `{file.name}` done. You can now ask questions!"
-    await msg.update()
     cl.user_session.set("chain", chain)

 chain_type_kwargs = {"prompt": prompt}
+def process_file(file_path: str):
+    # Example using PyPDF2 to extract text from a PDF file
+    from PyPDF2 import PdfReader
+    reader = PdfReader(file_path)
+    texts = []
+    for page in reader.pages:
+        texts.append(page.extract_text())
     return texts
 @cl.on_chat_start
 async def on_chat_start():
+    file = None
+    # Prompt users to upload either a text or PDF file
+    while file is None:
         files = await cl.AskFileMessage(
             content="Please upload a text or PDF file to begin!",
+            accept=["text/plain", "application/pdf"],  # This line is for UI guidance
+            max_size_mb=20,
             timeout=180,
         ).send()
+        if files:
+            file = files[0]  # Assuming the user uploads one file at a time
+    filename = file.name
+    # Initialize an empty list for texts, which will be populated based on the file type
     texts = []
+    # Process the file based on its extension
+    if filename.endswith('.txt'):
+        # Handle as text file
         with open(file.path, "r", encoding="utf-8") as f:
             text = f.read()
+        texts.append(text)
+        await cl.Message(content=f"`{filename}` uploaded, it contains {len(text)} characters!").send()
+    elif filename.endswith('.pdf'):
+        # Handle as PDF
+        texts = process_file(file.path)  # Adjust this call according to your PDF processing implementation
+    else:
+        await cl.Message(content="Unsupported file type uploaded. Please upload a text or PDF file.").send()
+        return  # Exit if the file type is not supported
+    # Process texts for conversational retrieval or other purposes here
+    # For demonstration, we'll just set up a simple Chroma vector store and conversational retrieval chain
+    # Create a Chroma vector store
+    embeddings = OpenAIEmbeddings()
+    docsearch = await cl.make_async(Chroma.from_texts)(
+        texts, embeddings, metadatas=[{"source": f"{i}-pl"} for i in range(len(texts))]
+    )
     message_history = ChatMessageHistory()
     memory = ConversationBufferMemory(
         memory_key="chat_history",
         output_key="answer",
         return_messages=True,
     )
+    # Set up the conversational retrieval chain
     chain = ConversationalRetrievalChain.from_llm(
         ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0, streaming=True),
         chain_type="stuff",
     )
     # Let the user know that the system is ready
+    await cl.Message(content=f"Your file `{filename}` is now ready for questions!").send()
+    # Save the chain in the user session for later use
     cl.user_session.set("chain", chain)