Spaces:

tareeb23
/

Document_Search_Engine

Sleeping

App Files Files Community

tareeb23 committed on Jul 25, 2024

Commit

1b5c287

verified ·

1 Parent(s): 806829b

Document Search Engine

Browse files

Files changed (1) hide show

app.py +74 -49

app.py CHANGED Viewed

@@ -3,6 +3,8 @@ from transformers import pipeline
 import re
 from collections import Counter
 import string
 @st.cache_resource
 def load_qa_pipeline():
@@ -21,68 +23,91 @@ def normalize_answer(s):
         return text.lower()
     return white_space_fix(remove_articles(remove_punc(lower(s))))
-def compute_exact_match(prediction, ground_truth):
-    return int(normalize_answer(prediction) == normalize_answer(ground_truth))
-def compute_f1(prediction, ground_truth):
-    prediction_tokens = normalize_answer(prediction).split()
-    ground_truth_tokens = normalize_answer(ground_truth).split()
-    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
-    num_same = sum(common.values())
-    if num_same == 0:
-        return 0
-    precision = 1.0 * num_same / len(prediction_tokens)
-    recall = 1.0 * num_same / len(ground_truth_tokens)
-    f1 = (2 * precision * recall) / (precision + recall)
-    return f1
 def main():
-    st.title("Question Answering with RoBERTa")
     # Load the QA pipeline
     qa_pipeline = load_qa_pipeline()
-    # User input for context
-    context = st.text_area("Enter the context:", height=200)
-    context = context.strip()  # Remove leading/trailing whitespace
-    # User input for question
-    question = st.text_input("Enter your question:")
-    question = question.strip()  # Remove leading/trailing whitespace
-    # Check for context and question length
-    if len(context) > 1500:
-        st.warning("Context should not exceed 1500 characters.")
-        return
-    if len(question) > 150:
-        st.warning("Question should not exceed 150 characters.")
-        return
-    # Option to calculate scores
-    calculate_scores = st.checkbox("Calculate scores")
-    if calculate_scores:
-        actual_answer = st.text_input("Enter the actual answer:")
-    if st.button("Get Answer"):
         if context and question:
-            # Get the answer
-            result = qa_pipeline(question=question, context=context)
-            # Display the result
-            st.subheader("Answer:")
-            st.write(result['answer'])
-            st.write(f"Confidence: {result['score']:.2f}")
-            # Calculate and display scores if option is selected
-            if calculate_scores and actual_answer:
-                em_score = compute_exact_match(result['answer'], actual_answer)
-                f1_score = compute_f1(result['answer'], actual_answer)
-                st.subheader("Scores:")
-                st.write(f"Exact Match: {em_score}")
-                st.write(f"F1 Score: {f1_score:.4f}")
         else:
-            st.warning("Please provide both context and question.")
 if __name__ == "__main__":
     main()

 import re
 from collections import Counter
 import string
+import docx2txt
+from io import BytesIO
 @st.cache_resource
 def load_qa_pipeline():
         return text.lower()
     return white_space_fix(remove_articles(remove_punc(lower(s))))
def chunk_text(text, chunk_size=1000):
    """Split *text* into sentence-aligned chunks of roughly *chunk_size* characters.

    Sentences are detected by whitespace that follows ``.``, ``!`` or ``?``.
    They are packed greedily, joined by single spaces, until adding the next
    sentence would push the chunk past ``chunk_size``.

    Args:
        text: The text to split.
        chunk_size: Target maximum length of a chunk, in characters.

    Returns:
        A list of non-empty, whitespace-stripped chunks in document order.
        Empty or whitespace-only input yields ``[]``. A single sentence longer
        than ``chunk_size`` is kept whole, so that one chunk may exceed the
        limit.
    """
    sentences = re.split(r'(?<=[.!?])\s+', text)
    chunks = []
    pending = []        # sentences collected for the chunk being built
    pending_length = 0  # length of those sentences, one separator space each

    def flush():
        # Never emit an empty chunk: it would hand a blank context to the
        # model (this happened when the very first sentence was oversized).
        chunk = " ".join(pending).strip()
        if chunk:
            chunks.append(chunk)

    for sentence in sentences:
        if pending_length + len(sentence) > chunk_size:
            flush()
            pending = []
            pending_length = 0
        pending.append(sentence)
        pending_length += len(sentence) + 1

    flush()
    return chunks
def highlight_text(text, start_indices, chunk_size, highlight_length=10):
    """Return *text* as HTML with ``<mark>`` tags around the requested spans.

    Each entry of ``start_indices`` is treated as an absolute character offset
    into ``text``; ``highlight_length`` characters from that offset are wrapped
    in ``<mark>...</mark>``. Offsets may arrive in any order. Duplicates and
    overlapping spans are merged, and offsets outside the text are ignored.

    The text itself is HTML-escaped, because the result is meant to be
    rendered as raw HTML and the source document is untrusted.

    Args:
        text: The plain text to annotate.
        start_indices: Iterable of integer offsets into ``text``.
        chunk_size: Unused; kept so existing callers keep working. Earlier
            code re-applied a chunk offset here even though the indices
            already included it, which pushed highlights to the wrong place.
        highlight_length: Number of characters to mark from each offset.

    Returns:
        The escaped text with ``<mark>`` tags inserted.
    """
    import html  # local import: the file's import block is outside this block

    text_length = len(text)

    # Build merged [start, end) spans in ascending order, so that inserting
    # tags never invalidates a later offset.
    spans = []
    for start in sorted(set(start_indices)):
        if not 0 <= start < text_length:
            continue
        end = min(start + highlight_length, text_length)
        if end <= start:
            continue
        if spans and start <= spans[-1][1]:
            spans[-1][1] = max(spans[-1][1], end)
        else:
            spans.append([start, end])

    pieces = []
    cursor = 0
    for start, end in spans:
        pieces.append(html.escape(text[cursor:start], quote=False))
        pieces.append("<mark>")
        pieces.append(html.escape(text[start:end], quote=False))
        pieces.append("</mark>")
        cursor = end
    pieces.append(html.escape(text[cursor:], quote=False))
    return "".join(pieces)
 def main():
+    st.title("Document Search Engine")
     # Load the QA pipeline
     qa_pipeline = load_qa_pipeline()
+    # File upload for Word documents
+    uploaded_file = st.file_uploader("Upload a Word document", type=['docx'])
+    if uploaded_file is not None:
+        doc_text = docx2txt.process(BytesIO(uploaded_file.read()))
+        st.session_state['context'] = doc_text
+    # Context input
+    if 'context' not in st.session_state:
+        st.session_state['context'] = ""
+    context = st.text_area("Enter or edit the context:", value=st.session_state['context'], height=300)
+    st.session_state['context'] = context
+    # Search input and button
+    col1, col2 = st.columns([3, 1])
+    with col1:
+        question = st.text_input("Enter your search query:")
+    with col2:
+        search_button = st.button("Search")
+    if search_button:
         if context and question:
+            chunks = chunk_text(context)
+            results = []
+            for i, chunk in enumerate(chunks):
+                result = qa_pipeline(question=question, context=chunk)
+                result['chunk_index'] = i
+                results.append(result)
+            # Sort results by score and get top 3
+            top_results = sorted(results, key=lambda x: x['score'], reverse=True)[:3]
+            st.subheader("Top 3 Results:")
+            for i, result in enumerate(top_results, 1):
+                st.write(f"{i}. Answer: {result['answer']}")
+                st.write(f"   Confidence: {result['score']:.2f}")
+            # Highlight answers in the context
+            chunk_size = 1000  # Make sure this matches the chunk_size in chunk_text function
+            start_indices = [result['start'] + (result['chunk_index'] * chunk_size) for result in top_results]
+            highlighted_context = highlight_text(context, start_indices, chunk_size)
+            st.subheader("Context with Highlighted Answers:")
+            st.markdown(highlighted_context, unsafe_allow_html=True)
         else:
+            st.warning("Please provide both context and search query.")
 # Script entry point: call main() only when this module is run directly,
 # not when it is imported.
 if __name__ == "__main__":
     main()