File size: 6,267 Bytes
20feecd
 
 
 
 
 
9cdb8d1
20feecd
a14ce03
bb772b0
20feecd
 
 
 
 
3137489
20feecd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bb772b0
20feecd
 
 
 
 
 
 
 
 
 
3137489
20feecd
 
 
 
a14ce03
20feecd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a14ce03
20feecd
 
3137489
20feecd
3137489
20feecd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import os
import sys
import logging
import gradio as gr
import requests
from pinecone import Pinecone, ServerlessSpec
from haystack.components.embedders import OpenAIDocumentEmbedder, OpenAITextEmbedder
from haystack.components.writers import DocumentWriter
from haystack_integrations.document_stores.pinecone import PineconeDocumentStore
from haystack_integrations.components.retrievers.pinecone import PineconeEmbeddingRetriever
from haystack import Pipeline
from haystack.components.generators import OpenAIGenerator
from haystack.components.builders import PromptBuilder
from haystack.components.converters import TextFileToDocument
from haystack.components.preprocessors import DocumentSplitter
from haystack.utils import Secret

# --- Logging ---
# Send INFO-and-above records to stdout.
logging.basicConfig(level=logging.INFO, stream=sys.stdout)

# --- Environment Variables ---
# Both credentials are mandatory; abort at import time when either is unset
# or empty. The Pinecone key is checked first, then the OpenAI key.
api_key = os.environ.get("PINECONE_API_KEY")
openai_api_key = os.environ.get("OPENAI_API_KEY")

_required_credentials = (
    (api_key, "Please set the PINECONE_API_KEY as an environment variable."),
    (openai_api_key, "Please set the OPENAI_API_KEY as an environment variable."),
)
for _credential, _missing_message in _required_credentials:
    if not _credential:
        raise ValueError(_missing_message)
os.environ["OPENAI_API_KEY"] = openai_api_key

# --- Pinecone Setup ---
index_name = "quickstart"
# NOTE(review): presumably matches the embedder's output size — confirm.
dimension = 1536
pc = Pinecone(api_key=api_key)

# Create the serverless index only when no index with this name is listed.
_existing_index_names = [idx["name"] for idx in pc.list_indexes()]
if index_name not in _existing_index_names:
    _serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        dimension=dimension,
        metric="euclidean",
        spec=_serverless_spec,
    )

# --- Document Loading and Processing ---
# Fetch the sample essay once; later runs reuse the local copy on disk.
os.makedirs("data/paul_graham", exist_ok=True)
file_path = "data/paul_graham/paul_graham_essay.txt"
if not os.path.exists(file_path):
    url = "https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt"
    # A timeout keeps startup from hanging indefinitely on a stalled connection.
    r = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error instead of caching (and later indexing)
    # the body of an error response.
    r.raise_for_status()
    # Explicit UTF-8 so the write does not depend on the platform's default
    # encoding.
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(r.text)

# --- Haystack Pipeline for Indexing ---
document_store = PineconeDocumentStore(
    api_key=Secret.from_env_var("PINECONE_API_KEY"),
    index=index_name,
)

# Stages run in the order converter -> splitter -> embedder -> writer.
indexing_pipeline = Pipeline()
for _stage_name, _stage in (
    ("converter", TextFileToDocument()),
    ("splitter", DocumentSplitter(split_by="word", split_length=100)),
    ("embedder", OpenAIDocumentEmbedder()),
    ("writer", DocumentWriter(document_store)),
):
    indexing_pipeline.add_component(_stage_name, _stage)

# Chain each stage's "documents" output into the next stage's "documents" input.
for _sender, _receiver in (
    ("converter.documents", "splitter.documents"),
    ("splitter.documents", "embedder.documents"),
    ("embedder.documents", "writer.documents"),
):
    indexing_pipeline.connect(_sender, _receiver)

# Index the essay only when the document store reports zero documents.
_store_is_empty = document_store.count_documents() == 0
if _store_is_empty:
    logging.info("Indexing the document...")
    indexing_pipeline.run({"converter": {"sources": [file_path]}})
    logging.info("Indexing complete.")

# --- Haystack Query Pipeline ---
# Prompt template: lists the content of each retrieved document, then the question.
template = """
Given the following context, answer the user's question.
If the context isn't sufficient, say that you don't have enough information.

Context:
{% for doc in documents %}
    {{ doc.content }}
{% endfor %}

Question: {{ query }}
"""
query_pipeline = Pipeline()
for _stage_name, _stage in (
    ("embedder", OpenAITextEmbedder()),
    ("retriever", PineconeEmbeddingRetriever(document_store=document_store)),
    ("prompt_builder", PromptBuilder(template=template)),
    ("llm", OpenAIGenerator(api_key=Secret.from_env_var("OPENAI_API_KEY"))),
):
    query_pipeline.add_component(_stage_name, _stage)

# Wiring: query embedding -> retriever -> prompt builder -> generator.
for _sender, _receiver in (
    ("embedder.embedding", "retriever.query_embedding"),
    ("retriever.documents", "prompt_builder.documents"),
    ("prompt_builder", "llm"),
):
    query_pipeline.connect(_sender, _receiver)

# --- Query Function ---
def ask_question(prompt):
    """Answer a question by running it through the query pipeline.

    Args:
        prompt: The question text entered by the user.

    Returns:
        The first reply produced by the ``llm`` component, as a string.
        If ``prompt`` is empty or whitespace-only, a short hint asking for a
        question is returned without running the pipeline. If the pipeline
        raises, a message starting with "❌ Error:" is returned instead.
    """
    # Skip the pipeline run entirely when there is nothing to ask.
    if not prompt or not prompt.strip():
        return "Please enter a question."
    try:
        results = query_pipeline.run(
            {"embedder": {"text": prompt}, "prompt_builder": {"query": prompt}}
        )
        response = results["llm"]["replies"][0]
        return str(response)
    except Exception as e:
        # UI boundary: keep the traceback in the logs and show the user a
        # readable message rather than letting the exception propagate.
        logging.exception("Query pipeline failed")
        return f"❌ Error: {str(e)}"

# --- Gradio UI ---
# Builds the single-page interface: a header, a collapsible explainer, a
# question box, an answer box, and Search/Clear buttons. The inline CSS string
# passed to gr.Blocks carries the page's colors, fonts and rounded-box styling.
with gr.Blocks(css="""body {    background-color: #f5f5dc;    font-family: 'Georgia', 'Merriweather', serif;}h1, h2, h3 {    color: #4e342e;}.gr-box, .gr-column, .gr-group {    border-radius: 15px;    padding: 20px;    background-color: #fffaf0;    box-shadow: 2px 4px 14px rgba(0, 0, 0, 0.1);    margin-top: 10px;}textarea, input[type="text"] {    background-color: #fffaf0;    border: 1px solid #d2b48c;    color: #4e342e;    border-radius: 8px;}button {    background-color: #a1887f;    color: white;    font-weight: bold;    border-radius: 8px;    transition: background-color 0.3s ease;}button:hover {    background-color: #8d6e63;}.gr-button {    border-radius: 8px !important;}""") as demo:
    with gr.Column():
        # Centered title and tagline, written as raw HTML inside Markdown.
        gr.Markdown("""
        <div style='text-align: center;'>
            <h1>🧠 Paul Graham Essay Q&A</h1>
            <div style='font-size: 1.1em; color: #6d4c41; margin-bottom: 1em;'>
                Explore insights from Paul Graham's essay using semantic search powered by <strong>Haystack</strong> + <strong>Pinecone</strong>.
            </div>
        </div>
        """)
        # Explainer section, collapsed by default (open=False).
        with gr.Accordion("ℹ️ What is Pinecone Vector Indexing?", open=False):
            gr.Markdown("""**Pinecone** is a vector database that stores document embeddings (numeric representations of meaning). When you ask a question, it's converted into a vector and compared against stored vectors to find the most relevant answers — even if they don't match word-for-word.""")
        gr.Markdown("### 📖 Ask your question below:")
        with gr.Group():
            # Row 1: question input.
            with gr.Row():
                user_input = gr.Textbox(
                    placeholder="E.g., What does Paul Graham say about startups?",
                    label="Your Question",
                    lines=2
                )
            # Row 2: answer display.
            with gr.Row():
                output = gr.Textbox(label="Answer", lines=6)
            # Row 3: action buttons.
            with gr.Row():
                submit_btn = gr.Button("🔍 Search Essay")
                clear_btn = gr.Button("🧹 Clear")
            # Search passes the question text to ask_question and shows its
            # return value in the answer box.
            submit_btn.click(fn=ask_question, inputs=user_input, outputs=output)
            # Clear resets both the question and the answer box to empty strings.
            clear_btn.click(fn=lambda: ("", ""), inputs=None, outputs=[user_input, output])
# Launch the app; this runs unconditionally whenever the module is executed.
demo.launch()