Samizie committed on
Commit
a5610f9
·
verified ·
1 Parent(s): 032f072

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +126 -1
app.py CHANGED
@@ -3,7 +3,132 @@ import asyncio
3
  import numpy as np
4
 
5
  # Assume these functions exist in your scraper module
6
- from .module import process_urls, store_embeddings, embed_text, query_llm
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
  # Streamlit UI
9
  st.title("Web Scraper & AI Query Interface")
 
3
  import numpy as np
4
 
5
  # Assume these functions exist in your scraper module
6
+ import asyncio
7
+ import requests
8
+ import pandas as pd
9
+ import re
10
+ import numpy as np
11
+ import faiss
12
+ from langchain_community.document_loaders import AsyncChromiumLoader
13
+ from langchain_community.document_transformers import Html2TextTransformer
14
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
15
+ from langchain_ollama import OllamaLLM
16
+ #from langchain_ollama import OllamaEmbeddings
17
+ from langchain_groq import ChatGroq
18
+ from itertools import chain
19
+ from sentence_transformers import SentenceTransformer
20
+ from langchain_community.vectorstores import FAISS
21
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
22
+
23
+ # Scraping and Embedding Function
24
+ async def process_urls(urls):
25
+ # Load multiple URLs asynchronously
26
+ loader = AsyncChromiumLoader(urls)
27
+ docs = await loader.aload()
28
+
29
+ # Transform HTML to text
30
+ text_transformer = Html2TextTransformer()
31
+ transformed_docs = text_transformer.transform_documents(docs)
32
+
33
+ # Split the text into chunks and retain metadata
34
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=5000, chunk_overlap=500)
35
+ split_docs_nested = [text_splitter.split_documents([doc]) for doc in transformed_docs]
36
+ #split_docs = text_splitter.split_documents(transformed_docs)
37
+ split_docs = list(chain.from_iterable(split_docs_nested))
38
+ # Attach the source URL to each split document
39
+ for doc in split_docs:
40
+ doc.metadata["source_url"] = doc.metadata.get("source", "Unknown") # Ensure URL metadata exists
41
+
42
+ return split_docs
43
+
44
def clean_text(text):
    """Normalize scraped page text.

    Collapses every run of whitespace (including line breaks) into a single
    space, trims the ends, then deletes bracketed asides such as ``[advert]``
    or parenthesised notes.
    """
    collapsed = re.sub(r'\s+', ' ', text).strip()
    without_brackets = re.sub(r'\[.*?\]|\(.*?\)', '', collapsed)
    return without_brackets
49
+
50
+
51
def embed_text(text_list):
    """Embed a list of strings with the nomic sentence-transformer model.

    Args:
        text_list: non-empty list of strings to embed.

    Returns:
        The model's encode() output (one vector per input string).

    Raises:
        ValueError: if ``text_list`` is empty or the model returns nothing.
    """
    if not text_list:
        raise ValueError("Embedding function returned an empty result.")

    # Cache the model on the function object: the original re-instantiated
    # SentenceTransformer (a heavy model load) on every call.
    model = getattr(embed_text, "_model", None)
    if model is None:
        model = SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True)
        embed_text._model = model

    vectors = model.encode(text_list)
    # BUG FIX: the original emptiness check ran on the *model* object
    # (which is never empty), not on the encode() result.
    if vectors is None or len(vectors) == 0:
        raise ValueError("Embedding function returned an empty result.")
    return vectors
57
+
58
+
59
def store_embeddings(docs):
    """Convert document chunks into embeddings and index them in FAISS.

    Args:
        docs: langchain Documents with ``page_content`` and a
              ``source_url`` metadata key (set by process_urls).

    Returns:
        (index, all_text, text_sources): a FAISS inner-product index over
        L2-normalized embeddings (i.e. cosine similarity), the cleaned
        chunk texts, and their source URLs — all index-aligned.

    Raises:
        ValueError: if the embedding step yields nothing.
    """
    # BUG FIX: build texts and sources together so they stay index-aligned.
    # The original filtered docs without page_content out of all_text but
    # not out of text_sources, which could attach wrong URLs to FAISS hits.
    all_text = []
    text_sources = []
    for doc in docs:
        if not hasattr(doc, "page_content"):
            continue
        all_text.append(clean_text(doc.page_content))
        # .get avoids a KeyError when a chunk somehow lacks source_url.
        text_sources.append(doc.metadata.get("source_url", "Unknown"))

    embeddings = embed_text(all_text)
    if embeddings is None or len(embeddings) == 0:
        raise ValueError("Embedding function returned None or empty list.")

    embeddings = np.array(embeddings, dtype=np.float32)
    # Normalize so inner product equals cosine similarity.
    faiss.normalize_L2(embeddings)
    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings)

    return index, all_text, text_sources
77
+
78
def search_faiss(index, query_embedding, text_data, text_sources, top_k=5, min_score=0.5):
    """Return up to ``top_k`` chunks whose cosine similarity >= ``min_score``.

    Args:
        index: FAISS inner-product index over normalized embeddings.
        query_embedding: 1-D query vector (any float dtype).
        text_data: chunk texts, index-aligned with the FAISS index.
        text_sources: source URLs, index-aligned with ``text_data``.
        top_k: maximum number of hits to request from FAISS.
        min_score: similarity threshold below which hits are dropped.

    Returns:
        List of {"source": url, "content": text} dicts, best first.
    """
    # faiss requires a contiguous float32 (1, d) matrix. The explicit dtype
    # conversion was commented out in the original, which breaks
    # normalize_L2 for float64 inputs.
    query_embedding = np.asarray(query_embedding, dtype=np.float32).reshape(1, -1)
    faiss.normalize_L2(query_embedding)

    distances, indices = index.search(query_embedding, top_k)

    results = []
    for rank, idx in enumerate(indices[0]):
        if distances[0][rank] < min_score:
            continue
        # BUG FIX: FAISS pads missing hits with index -1; the original
        # check ``idx < len(text_data)`` accepted -1 and silently returned
        # the *last* document. Reject negatives explicitly.
        if 0 <= idx < len(text_data):
            results.append({"source": text_sources[idx], "content": text_data[idx]})

    return results
94
+
95
def query_llm(index, text_data, text_sources, query):
    """Answer ``query`` from the indexed documents via the Groq LLM.

    Embeds the query, retrieves the top-3 relevant chunks from FAISS, and
    asks the LLM to answer using each chunk as context.

    Args:
        index: FAISS index returned by store_embeddings.
        text_data: chunk texts aligned with the index.
        text_sources: source URLs aligned with ``text_data``.
        query: the user's natural-language question.

    Returns:
        A list of {"source": url, "response": llm_response} dicts, or the
        string "No relevant information found." when nothing matched.

    Raises:
        RuntimeError: if the GROQ_API_KEY environment variable is unset.
    """
    import os

    # SECURITY FIX: the original hard-coded a Groq API key in source.
    # Read it from the environment instead (the leaked key must be revoked).
    groq_api = os.environ.get("GROQ_API_KEY")
    if not groq_api:
        raise RuntimeError("GROQ_API_KEY environment variable is not set.")
    chat = ChatGroq(model="llama-3.2-1b-preview", groq_api_key=groq_api, temperature=0)

    # Embed the query and retrieve the closest chunks.
    query_embedding = embed_text([query])[0]
    relevant_docs = search_faiss(index, query_embedding, text_data, text_sources, top_k=3)

    if not relevant_docs:
        return "No relevant information found."

    responses = []
    for doc in relevant_docs:
        # Defensive: skip anything that isn't the expected dict shape.
        if not (isinstance(doc, dict) and "source" in doc and "content" in doc):
            continue
        source_url = doc["source"]
        content = doc["content"][:10000]  # cap context size per prompt

        # BUG FIX: the original f-string contained a stray '"' line that
        # leaked into every prompt; it is removed here.
        prompt = (
            f'Based on the following content, answer the question: "{query}"\n'
            f"Content (from {source_url}):\n"
            f"{content}\n"
        )
        # NOTE(review): chat.invoke returns a message object; callers appear
        # to consume it as-is, so it is stored unmodified.
        responses.append({"source": source_url, "response": chat.invoke(prompt)})

    return responses
132
 
133
  # Streamlit UI
134
  st.title("Web Scraper & AI Query Interface")