Update app.py
app.py CHANGED
@@ -16,9 +16,6 @@ from torchvision import transforms
 from torchvision.models import resnet50, ResNet50_Weights
 from torchvision import transforms, models
 
-
-
-
 class GeometryImageClassifier:
     def __init__(self):
         # Load ResNet50 but only use it for feature extraction
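The comment kept in this hunk says ResNet50 is loaded only for feature extraction. A minimal sketch of that pattern, assuming the classifier head is simply swapped for an identity; the actual GeometryImageClassifier body is not part of this diff, so the details below are illustrative:

import torch
from torchvision.models import resnet50, ResNet50_Weights

weights = ResNet50_Weights.DEFAULT
model = resnet50(weights=weights)
model.fc = torch.nn.Identity()  # keep the 2048-d pooled features, drop the classifier
model.eval()

preprocess = weights.transforms()  # matching resize/crop/normalize pipeline

with torch.no_grad():
    batch = torch.zeros(1, 3, 224, 224)  # stand-in for a preprocessed image
    features = model(batch)              # shape (1, 2048)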
@@ -101,9 +98,6 @@ class GeometryImageClassifier:
 # ✅ Use a strong sentence embedding model
 semantic_model = SentenceTransformer("all-MiniLM-L6-v2")
 
-
-
-
 def extract_text_from_docx(file_path):
     """ ✅ Extracts normal text & tables from a .docx file for better retrieval. """
     doc = docx.Document(file_path)
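The all-MiniLM-L6-v2 model loaded here is used later with normalize_embeddings=True, where a plain dot product of the unit-length vectors equals cosine similarity. A small self-contained example of that check, with an illustrative query and passage:

import numpy as np
from sentence_transformers import SentenceTransformer

semantic_model = SentenceTransformer("all-MiniLM-L6-v2")

query = "How do I choose a fastener?"
passage = "Hex bolts and machine screws are selected by load and material."

# normalize_embeddings=True returns unit-length vectors, so the dot product
# below is exactly the cosine similarity that app.py compares against 0.3
q = semantic_model.encode(query, normalize_embeddings=True)
p = semantic_model.encode(passage, normalize_embeddings=True)
print(float(np.dot(q, p)))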
@@ -125,20 +119,14 @@ def extract_text_from_docx(file_path):
 
     return "\n".join(extracted_text)
 
-
-
-
 def load_documents():
     """ ✅ Loads & processes documents, ensuring table data is properly extracted. """
     file_paths = {
         "Fastener_Types_Manual": "Fastener_Types_Manual.docx",
         "Manufacturing_Expert_Manual": "Manufacturing Expert Manual.docx"
     }
-
-
     all_splits = []
 
-
     for doc_name, file_path in file_paths.items():
         if not os.path.exists(file_path):
             raise FileNotFoundError(f"Document not found: {file_path}")
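Only the first and last lines of extract_text_from_docx appear in these hunks. A sketch of the paragraph-plus-table extraction its docstring describes, using python-docx; the real body may differ:

import docx  # python-docx

def extract_text_from_docx(file_path):
    # One plausible reading of "extracts normal text & tables": paragraphs
    # first, then each table row flattened to "cell | cell | ...".
    doc = docx.Document(file_path)
    extracted_text = [p.text for p in doc.paragraphs if p.text.strip()]
    for table in doc.tables:
        for row in table.rows:
            extracted_text.append(" | ".join(cell.text.strip() for cell in row.cells))
    return "\n".join(extracted_text)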
@@ -161,118 +149,90 @@ def load_documents():
 
     return all_splits
 
-
-
-
 def create_db(splits):
     """ ✅ Creates a FAISS vector database from document splits. """
     embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")
     vectordb = FAISS.from_documents(splits, embeddings)
     return vectordb
 
-
-
-
 def retrieve_documents(query, retriever, embeddings):
-    "
-
+    print("\n=== Document Retrieval Process ===")
+    print(f"Query: {query}")
+
     results = retriever.invoke(query)
-
-
+    print(f"Initial results count: {len(results)}")
+
     if not results:
+        print("No initial results found")
         return []
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+    reranked_results = rerank_documents(query, results, top_k=3)
+    print(f"Reranked results count: {len(reranked_results)}")
+
+    filtered_chunks = filter_relevant_chunks(query, reranked_results, embeddings, threshold=0.7)
+    print(f"Filtered chunks count: {len(filtered_chunks)}")
+
+    if not filtered_chunks:
+        print("No chunks passed filtering")
+        return []
+
+    doc_embeddings = np.array([embeddings.embed_query(doc.page_content) for doc in filtered_chunks])
+    query_embedding = np.array(embeddings.embed_query(query)).reshape(1, -1)
+    similarity_scores = cosine_similarity(query_embedding, doc_embeddings)[0]
+
+    print("\nSimilarity Scores:")
+    for doc, score in zip(filtered_chunks, similarity_scores):
+        print(f"Score: {score:.4f} | Source: {doc.metadata.get('source', 'Unknown')}")
+        print(f"Content Preview: {doc.page_content[:100]}...\n")
+
+    MIN_SIMILARITY = 0.5
+    filtered_results = [(doc, sim) for doc, sim in zip(filtered_chunks, similarity_scores) if sim >= MIN_SIMILARITY]
+    print(f"Final filtered results count: {len(filtered_results)}")
+
     return [doc for doc, _ in filtered_results] if filtered_results else []
 
-
-
-
 def validate_query_semantically(query, retrieved_docs):
-    "
+    print("\n=== Semantic Validation ===")
     if not retrieved_docs:
+        print("No documents to validate")
         return False
 
-
     combined_text = " ".join([doc.page_content for doc in retrieved_docs])
     query_embedding = semantic_model.encode(query, normalize_embeddings=True)
     doc_embedding = semantic_model.encode(combined_text, normalize_embeddings=True)
-
-
-
-
-
-
-
-
-    return similarity_score >= 0.3  # 🔥 Stricter threshold to ensure correctness
-
-
+    similarity_score = np.dot(query_embedding, doc_embedding)
+
+    print(f"Query: {query}")
+    print(f"Semantic similarity score: {similarity_score:.4f}")
+    print(f"Validation {'passed' if similarity_score >= 0.3 else 'failed'}")
+
+    return similarity_score >= 0.3
 
 
 def handle_query(query, history, retriever, qa_chain, embeddings):
     """ ✅ Handles user queries & prevents hallucination. """
     retrieved_docs = retrieve_documents(query, retriever, embeddings)
-
-
     if not retrieved_docs or not validate_query_semantically(query, retrieved_docs):
         return history + [(query, "I couldn't find any relevant information.")], ""
-
-
     response = qa_chain.invoke({"question": query, "chat_history": history})
     assistant_response = response['answer'].strip()
-
-
-    # ✅ Final hallucination check
     if not validate_query_semantically(query, retrieved_docs):
         assistant_response = "I couldn't find any relevant information."
-
-
     assistant_response += f"\n\n📄 **Source:** {', '.join(set(doc.metadata.get('source', 'Unknown') for doc in retrieved_docs))}"
-
-
-    # ✅ Debugging logs
     print(f"🤖 LLM Response: {assistant_response[:300]}")  # ✅ Limit output for debugging
-
-
     history.append((query, assistant_response))
     return history, ""
 
-
-
-
 def initialize_chatbot(vector_db):
     """ ✅ Initializes chatbot with improved retrieval & processing. """
     memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True, output_key='answer')
-
-
     embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")
-
-
     retriever = vector_db.as_retriever(search_kwargs={"k": 5, "search_type": "similarity"})
-
-
     system_prompt = """You are an AI assistant that answers questions **ONLY based on the provided documents**.
     - **If no relevant documents are retrieved, respond with: "I couldn't find any relevant information."**
     - **If the meaning of the query does not match the retrieved documents, say "I couldn't find any relevant information."**
     - **Do NOT attempt to answer from general knowledge.**
     """
-
-
     llm = HuggingFaceEndpoint(
         repo_id="tiiuae/falcon-40b-instruct",
         huggingfacehub_api_token=os.environ.get("HUGGINGFACE_API_TOKEN"),
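The rewritten retrieve_documents calls rerank_documents and filter_relevant_chunks, neither of which appears in this diff. Plausible stand-ins follow, assuming a cross-encoder reranker and an embedding-similarity filter; only the names, argument lists, and thresholds come from the call sites above, everything else (including the cross-encoder checkpoint) is an assumption:

import numpy as np
from sentence_transformers import CrossEncoder
from sklearn.metrics.pairwise import cosine_similarity

# Hypothetical reconstruction, not the app's actual helpers.
_reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

def rerank_documents(query, docs, top_k=3):
    # Score each (query, passage) pair with the cross-encoder, keep the best top_k.
    scores = _reranker.predict([(query, d.page_content) for d in docs])
    ranked = sorted(zip(docs, scores), key=lambda pair: pair[1], reverse=True)
    return [d for d, _ in ranked[:top_k]]

def filter_relevant_chunks(query, docs, embeddings, threshold=0.7):
    # Keep only chunks whose embedding similarity to the query clears the threshold.
    q = np.array(embeddings.embed_query(query)).reshape(1, -1)
    d_vecs = np.array([embeddings.embed_query(d.page_content) for d in docs])
    sims = cosine_similarity(q, d_vecs)[0]
    return [d for d, s in zip(docs, sims) if s >= threshold]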
@@ -281,15 +241,12 @@ def initialize_chatbot(vector_db):
         task="text-generation",
         system_prompt=system_prompt)
 
-
     qa_chain = ConversationalRetrievalChain.from_llm(
         llm=llm,
         retriever=retriever,
         memory=memory,
         return_source_documents=True,
         verbose=False)
-
-
     return retriever, qa_chain, embeddings
 
 
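For reference, a minimal sketch of how these functions compose at startup. The Gradio layer that actually drives handle_query is outside this diff, so the driver below, including the sample question, is assumed:

# Hypothetical driver; function names come from app.py above.
splits = load_documents()
vector_db = create_db(splits)
retriever, qa_chain, embeddings = initialize_chatbot(vector_db)

history = []
history, _ = handle_query(
    "What fastener types does the manual cover?",
    history, retriever, qa_chain, embeddings)
print(history[-1][1])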