File size: 4,847 Bytes
02c5e78
0612daf
02c5e78
 
 
 
 
 
 
6f39035
02c5e78
 
6f39035
 
7a1c90d
02c5e78
6f39035
02c5e78
6f39035
 
c69af1e
 
02c5e78
 
 
6f39035
02c5e78
5c8f5eb
 
 
 
 
 
 
 
7a1c90d
6f39035
02c5e78
 
5c8f5eb
02c5e78
 
9cafd43
5c8f5eb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6f39035
 
 
5c8f5eb
02c5e78
5c8f5eb
 
 
 
 
 
6f39035
5c8f5eb
02c5e78
 
 
 
 
6f39035
02c5e78
6f39035
fd67371
02c5e78
6f39035
 
 
 
 
 
02c5e78
 
 
 
 
 
 
 
fd67371
02c5e78
 
 
 
 
 
7a1c90d
 
02c5e78
 
fd67371
02c5e78
 
 
 
 
 
 
 
 
6f39035
7a1c90d
d949de5
02c5e78
7a1c90d
02c5e78
d949de5
02c5e78
7a1c90d
02c5e78
6f39035
 
 
 
02c5e78
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
import os
import pickle
import gradio as gr
from transformers import pipeline
from langchain_classic.chains import RetrievalQAWithSourcesChain
from langchain_community.document_loaders import UnstructuredURLLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
from langchain_google_genai import ChatGoogleGenerativeAI


# ------------------------
# Configuration
# ------------------------

# API key read from the environment.
# NOTE(review): the original comment called this a "HuggingFace API token",
# but the value is passed to ChatGoogleGenerativeAI below, so it must be a
# Google API key — confirm the deployment sets API_TOKEN accordingly.
token = os.environ.get("API_TOKEN")

# ------------------------
# LLM
# ------------------------
# Fix: the original passed `max_token=100`, which is not a recognized
# parameter; the documented keyword is `max_output_tokens`.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    temperature=0.7,
    api_key=token,
    max_output_tokens=100,
)

# Global QA chain, populated by process_urls_with_logs().
chain = None

# ------------------------
# Paths used to persist the FAISS vectorstore and the processed URL list
# ------------------------
FAISS_FILE = "vectorstore.pkl"
URLS_FILE = "urls.pkl"

# ------------------------
# Function to process URLs with logging and FAISS management
# ------------------------
def process_urls_with_logs(url1, url2, url3):
    """Build (or reuse) the FAISS index for the given URLs and init the QA chain.

    Args:
        url1, url2, url3: Up to three URLs; blank entries are ignored.

    Returns:
        A human-readable status string shown in the Gradio UI.

    Side effects:
        Sets the module-level ``chain`` and reads/writes FAISS_FILE and
        URLS_FILE on disk.
    """
    global chain

    # Keep only non-empty, trimmed URLs.
    urls = [u.strip() for u in (url1, url2, url3) if u and u.strip()]
    if not urls:
        return "Please provide at least one URL."

    # Load the URL set the persisted index was built from, if one exists.
    if os.path.exists(FAISS_FILE) and os.path.exists(URLS_FILE):
        with open(URLS_FILE, "rb") as f:
            saved_urls = pickle.load(f)
    else:
        saved_urls = []

    if set(urls) != set(saved_urls):
        # URL set changed (or no index yet): rebuild from scratch.
        # Fix: removed the original `del globals()['vectorstore']` — the
        # vectorstore only ever exists as a function local, so that branch
        # could never fire and freed nothing.
        print("New URLs detected or FAISS does not exist. Recreating FAISS...")

        print("Loading URLs...")
        loader = UnstructuredURLLoader(urls=urls)
        documents = loader.load()

        print("Splitting documents into chunks...")
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=200)
        splits = text_splitter.split_documents(documents)

        print("Creating embeddings...")
        embeddings = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")

        print("Creating vector database (FAISS)...")
        vectorstore = FAISS.from_documents(documents=splits, embedding=embeddings)

        # Persist the index and URL list so later runs can skip rebuilding.
        # NOTE(review): pickling a FAISS vectorstore is fragile, and
        # pickle.load on an untrusted file can execute arbitrary code;
        # FAISS.save_local/load_local is the supported persistence API —
        # consider migrating.
        with open(FAISS_FILE, "wb") as f:
            pickle.dump(vectorstore, f)
        with open(URLS_FILE, "wb") as f:
            pickle.dump(urls, f)

        print("Initializing LLM chain...")
        chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=vectorstore.as_retriever())
        return "FAISS successfully created/recreated!"

    # Same URL set as last time: reuse the persisted index.
    print("No new URLs. Using existing FAISS.")
    with open(FAISS_FILE, "rb") as f:
        vectorstore = pickle.load(f)

    chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=vectorstore.as_retriever())
    return "Existing FAISS loaded."

# ------------------------
# Function to answer questions
# ------------------------
def ask_question(question):
    """Answer *question* with the prepared QA chain.

    Args:
        question: The user's free-text question.

    Returns:
        An ``(answer, sources)`` tuple of strings, one value per Gradio
        output component.
    """
    # Bug fix: this handler is wired to TWO Gradio outputs
    # (answer_output, sources_output), but the original returned a single
    # string when no chain was ready, mismatching the outputs contract.
    # Always return a 2-tuple.
    if chain is None:
        return "Please process URLs first.", ""

    result = chain.invoke({'question': question})
    return result.get("answer", ""), result.get("sources", "")

# ------------------------
# Gradio Interface
# ------------------------
with gr.Blocks() as app:
    with gr.Row():
        # Sidebar column: URL inputs plus the processing button.
        with gr.Column(scale=1):
            gr.Markdown("## Insert URLs")

            url_boxes = [gr.Textbox(label=f"URL {i}") for i in (1, 2, 3)]

            process_btn = gr.Button("Process URLs")
            status_output = gr.Textbox(label="Status", lines=8)

        # Main column: question input and answer/sources display.
        with gr.Column(scale=2):
            gr.Markdown("## Write your question")

            question_box = gr.Textbox(
                label="Your Question",
                placeholder="Type your question based on the URLs...",
                lines=4,
            )

            ask_btn = gr.Button("Ask")
            answer_output = gr.Textbox(label="Answer", lines=8)
            sources_output = gr.Textbox(label="Sources", lines=4)

    # Wire each button to its handler.
    process_btn.click(
        process_urls_with_logs,
        inputs=url_boxes,
        outputs=status_output,
    )

    ask_btn.click(
        ask_question,
        inputs=question_box,
        outputs=[answer_output, sources_output],
    )

# Launch the Gradio app
app.launch()