Spaces:

sidbhasin
/

PDF_Answer_AI_By_Syncmerce

Sleeping

App Files Files Community

sidbhasin committed on Nov 12, 2024

Commit

1be21c8

verified ·

1 Parent(s): 6fce530

Update app.py

Browse files

Files changed (1) hide show

app.py +137 -105

app.py CHANGED Viewed

@@ -4,7 +4,6 @@ import pdfplumber
 import torch
 from PyPDF2 import PdfReader
 import re
-import textwrap
 # Set page config
 st.set_page_config(
@@ -13,74 +12,71 @@ st.set_page_config(
     layout="wide"
 )
-# Custom CSS with improved styling
 st.markdown("""
     <style>
-    .stApp {
-        max-width: 1200px;
-        margin: 0 auto;
     }
-    .chat-message {
-        padding: 1.5rem;
-        border-radius: 0.5rem;
-        margin-bottom: 1rem;
         display: flex;
         flex-direction: column;
-        color: #ffffff;
     }
-    .chat-message.user {
         background-color: #2b313e;
     }
-    .chat-message.assistant {
-        background-color: #475063;
     }
-    .chat-message .content {
-        display: flex;
-        margin-bottom: 0.5rem;
-        padding: 1rem;
-        border-radius: 0.5rem;
     }
-    .chat-message .metadata {
-        font-size: 0.85rem;
-        color: #a8a8a8;
-        margin-top: 0.5rem;
     }
     .chat-input {
         position: fixed;
         bottom: 0;
         left: 0;
         right: 0;
-        padding: 1rem;
-        background-color: #262730;
-    }
-    .source-info {
-        font-size: 0.8rem;
-        color: #666;
-        margin-top: 0.5rem;
-        padding: 0.5rem;
-        background-color: #f0f2f6;
-        border-radius: 0.3rem;
     }
     </style>
 """, unsafe_allow_html=True)
-# Initialize session state
-if 'messages' not in st.session_state:
-    st.session_state.messages = []
-if 'text_data' not in st.session_state:
-    st.session_state.text_data = None
 @st.cache_resource
-def load_model():
     return pipeline(
         "question-answering",
         model="deepset/roberta-base-squad2",
         tokenizer="deepset/roberta-base-squad2"
     )
-def extract_text_with_metadata(pdf_file):
     text_data = []
     with pdfplumber.open(pdf_file) as pdf:
         for page_num, page in enumerate(pdf.pages, 1):
             text = page.extract_text()
@@ -88,46 +84,58 @@ def extract_text_with_metadata(pdf_file):
                 paragraphs = text.split('\n\n')
                 for para_num, paragraph in enumerate(paragraphs, 1):
                     if paragraph.strip():
-                        lines = paragraph.split('\n')
-                        for line_num, line in enumerate(lines, 1):
-                            text_data.append({
-                                'text': line.strip(),
-                                'page': page_num,
-                                'paragraph': para_num,
-                                'line': line_num,
-                                'full_paragraph': paragraph.strip()
-                            })
     return text_data
-def find_answer(question, text_data, qa_model):
-    full_text = ' '.join([item['text'] for item in text_data])
-    try:
-        result = qa_model(question=question, context=full_text)
-        answer_text = result['answer']
-        answer_score = result['score']
-        # Find the source paragraph
-        for item in text_data:
-            if answer_text in item['text']:
-                return {
-                    'answer': answer_text,
-                    'confidence': answer_score,
-                    'page': item['page'],
-                    'paragraph': item['paragraph'],
-                    'line': item['line'],
-                    'context': item['full_paragraph']
                 }
-    except Exception as e:
-        st.error(f"Error processing question: {str(e)}")
-        return None
 def main():
-    st.title("📚 PDF AI Chat")
     try:
-        qa_model = load_model()
     except Exception as e:
         st.error(f"Error loading model: {str(e)}")
         return
@@ -135,67 +143,91 @@ def main():
     # File upload
     pdf_file = st.file_uploader("Upload PDF Document", type=['pdf'])
-    if pdf_file and not st.session_state.text_data:
         with st.spinner("Processing PDF..."):
             try:
-                st.session_state.text_data = extract_text_with_metadata(pdf_file)
-                st.success("PDF processed successfully!")
             except Exception as e:
                 st.error(f"Error processing PDF: {str(e)}")
                 return
-    # Display chat messages
     for message in st.session_state.messages:
-        with st.chat_message(message["role"]):
-            st.write(message["content"])
-            if "metadata" in message:
-                st.markdown(f"""
                     <div class="source-info">
-                        Source: Page {message['metadata']['page']},
-                        Paragraph {message['metadata']['paragraph']},
-                        Line {message['metadata']['line']}
-                        <br>Confidence: {message['metadata']['confidence']:.2%}
                     </div>
-                """, unsafe_allow_html=True)
-    # Chat input
-    if st.session_state.text_data:
-        if question := st.chat_input("Ask a question about the document"):
-            # Add user message
             st.session_state.messages.append({"role": "user", "content": question})
             # Generate answer
             with st.spinner("Finding answer..."):
-                result = find_answer(question, st.session_state.text_data, qa_model)
-                if result:
-                    # Add assistant message
                     st.session_state.messages.append({
                         "role": "assistant",
-                        "content": result['answer'],
                         "metadata": {
-                            "page": result['page'],
-                            "paragraph": result['paragraph'],
-                            "line": result['line'],
-                            "confidence": result['confidence']
                         }
                     })
-                    # Rerun to update chat display
                     st.rerun()
     else:
         st.markdown("""
             ### Instructions:
             1. Upload a PDF document using the file uploader above
             2. Wait for the document to be processed
-            3. Start asking questions about the document
-            4. Get detailed answers with source information
             ### Features:
-            - Chat-like interface
-            - Source tracking
             - Confidence scores
-            - Context preservation
         """)
 if __name__ == "__main__":

 import torch
 from PyPDF2 import PdfReader
 import re
 # Set page config
 st.set_page_config(
     layout="wide"
 )
+# Custom CSS for better styling
 st.markdown("""
     <style>
+    .chat-container {
+        display: flex;
+        flex-direction: column;
+        gap: 20px;
+        padding: 20px;
+        height: calc(100vh - 200px);
+        overflow-y: auto;
     }
+    .message-container {
         display: flex;
         flex-direction: column;
+        gap: 10px;
+        padding: 15px;
+        border-radius: 10px;
+        max-width: 90%;
     }
+    .user-message {
         background-color: #2b313e;
+        color: white;
+        align-self: flex-end;
     }
+    .assistant-message {
+        background-color: #f0f2f6;
+        color: black;
+        align-self: flex-start;
     }
+    .source-info {
+        font-size: 0.8em;
+        color: #666;
+        border-top: 1px solid #ddd;
+        margin-top: 10px;
+        padding-top: 10px;
     }
+    .context-box {
+        background-color: #f8f9fa;
+        border-left: 3px solid #1f77b4;
+        padding: 10px;
+        margin-top: 10px;
+        font-size: 0.9em;
     }
     .chat-input {
         position: fixed;
         bottom: 0;
         left: 0;
         right: 0;
+        padding: 20px;
+        background: white;
+        border-top: 1px solid #ddd;
     }
     </style>
 """, unsafe_allow_html=True)
 @st.cache_resource
+def load_qa_model():
     return pipeline(
         "question-answering",
         model="deepset/roberta-base-squad2",
         tokenizer="deepset/roberta-base-squad2"
     )
+def process_pdf(pdf_file):
     text_data = []
     with pdfplumber.open(pdf_file) as pdf:
         for page_num, page in enumerate(pdf.pages, 1):
             text = page.extract_text()
                 paragraphs = text.split('\n\n')
                 for para_num, paragraph in enumerate(paragraphs, 1):
                     if paragraph.strip():
+                        text_data.append({
+                            'text': paragraph.strip(),
+                            'page': page_num,
+                            'paragraph': para_num,
+                            'context': paragraph.strip()
+                        })
     return text_data
+def find_best_answer(question, text_data, qa_model):
+    best_answer = None
+    max_score = 0
+    relevant_context = []
+    for chunk in text_data:
+        try:
+            result = qa_model(
+                question=question,
+                context=chunk['text'],
+                max_answer_len=100
+            )
+            if result['score'] > max_score:
+                max_score = result['score']
+                best_answer = {
+                    'answer': result['answer'],
+                    'confidence': result['score'],
+                    'page': chunk['page'],
+                    'paragraph': chunk['paragraph'],
+                    'context': chunk['context']
                 }
+            # Collect relevant contexts
+            if result['score'] > 0.1:  # Threshold for relevance
+                relevant_context.append(chunk['context'])
+        except Exception as e:
+            continue
+    return best_answer, relevant_context[:3]  # Return top 3 relevant contexts
 def main():
+    st.title("📚 Advanced PDF Question Answering")
+    # Initialize session state
+    if 'messages' not in st.session_state:
+        st.session_state.messages = []
+    if 'pdf_data' not in st.session_state:
+        st.session_state.pdf_data = None
+    # Load QA model
     try:
+        qa_model = load_qa_model()
     except Exception as e:
         st.error(f"Error loading model: {str(e)}")
         return
     # File upload
     pdf_file = st.file_uploader("Upload PDF Document", type=['pdf'])
+    if pdf_file and not st.session_state.pdf_data:
         with st.spinner("Processing PDF..."):
             try:
+                st.session_state.pdf_data = process_pdf(pdf_file)
+                st.success("PDF processed successfully! You can now ask questions.")
             except Exception as e:
                 st.error(f"Error processing PDF: {str(e)}")
                 return
+    # Chat interface
+    st.markdown('<div class="chat-container">', unsafe_allow_html=True)
+    # Display chat history
     for message in st.session_state.messages:
+        if message["role"] == "user":
+            st.markdown(f"""
+                <div class="message-container user-message">
+                    {message["content"]}
+                </div>
+            """, unsafe_allow_html=True)
+        else:
+            st.markdown(f"""
+                <div class="message-container assistant-message">
+                    <div>{message["content"]}</div>
                     <div class="source-info">
+                        Source: Page {message["metadata"]["page"]},
+                        Paragraph {message["metadata"]["paragraph"]}
+                        (Confidence: {message["metadata"]["confidence"]:.1%})
+                    </div>
+                    <div class="context-box">
+                        {message["metadata"]["context"]}
                     </div>
+                </div>
+            """, unsafe_allow_html=True)
+    st.markdown('</div>', unsafe_allow_html=True)
+    # Question input
+    if st.session_state.pdf_data:
+        question = st.text_input("Ask a question about the document:", key="question_input")
+        if question:
+            # Add user question to chat history
             st.session_state.messages.append({"role": "user", "content": question})
             # Generate answer
             with st.spinner("Finding answer..."):
+                answer, relevant_contexts = find_best_answer(
+                    question,
+                    st.session_state.pdf_data,
+                    qa_model
+                )
+                if answer:
+                    # Add assistant response to chat history
                     st.session_state.messages.append({
                         "role": "assistant",
+                        "content": answer["answer"],
                         "metadata": {
+                            "page": answer["page"],
+                            "paragraph": answer["paragraph"],
+                            "confidence": answer["confidence"],
+                            "context": answer["context"]
                         }
                     })
+                    # Force refresh
                     st.rerun()
+                else:
+                    st.error("Sorry, I couldn't find a relevant answer in the document.")
     else:
         st.markdown("""
             ### Instructions:
             1. Upload a PDF document using the file uploader above
             2. Wait for the document to be processed
+            3. Start asking questions about the content
+            4. Get detailed answers with source information and context
             ### Features:
+            - Natural conversation interface
+            - Source tracking with page numbers
             - Confidence scores
+            - Relevant context display
+            - Multiple question support
         """)
 if __name__ == "__main__":