File size: 5,912 Bytes
74b3e59
d9576ee
74b3e59
 
d9576ee
74b3e59
 
 
d9576ee
74b3e59
 
 
 
 
d9576ee
74b3e59
d9576ee
74b3e59
d9576ee
74b3e59
d9576ee
74b3e59
d9576ee
58159ec
 
74b3e59
 
d9576ee
74b3e59
 
d9576ee
 
 
 
 
 
 
 
74b3e59
 
d9576ee
74b3e59
 
d9576ee
 
74b3e59
 
 
 
 
 
 
 
 
 
 
d9576ee
74b3e59
 
58159ec
74b3e59
d9576ee
 
 
 
 
 
58159ec
74b3e59
 
d9576ee
 
 
 
74b3e59
 
 
 
 
d9576ee
 
 
58159ec
 
 
d9576ee
58159ec
 
d9576ee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74b3e59
 
d9576ee
74b3e59
 
58159ec
d9576ee
74b3e59
 
58159ec
74b3e59
 
d9576ee
 
 
74b3e59
 
58159ec
 
74b3e59
 
 
58159ec
 
d9576ee
58159ec
d9576ee
 
 
74b3e59
 
58159ec
74b3e59
 
 
 
 
d9576ee
 
 
 
 
 
 
 
 
 
74b3e59
 
d9576ee
74b3e59
 
 
 
 
58159ec
74b3e59
 
58159ec
d9576ee
74b3e59
d9576ee
58159ec
 
 
 
d9576ee
58159ec
74b3e59
 
 
 
 
58159ec
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
# ============================================
# Civil Engineering RAG (ASTM) - Hugging Face Version
# ============================================
import os
import fitz  # PyMuPDF
import faiss
import numpy as np
import gradio as gr
import tempfile
from typing import List
from groq import Groq
from sentence_transformers import SentenceTransformer

# --------------------------
# πŸ”‘ API Key
# --------------------------
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
if not GROQ_API_KEY:
    raise RuntimeError("❌ Missing GROQ_API_KEY. Please add it in Hugging Face β†’ Settings β†’ Secrets.")

# Initialize Groq client and embedding model
client = Groq(api_key=GROQ_API_KEY)
embedder = SentenceTransformer("all-MiniLM-L6-v2")

INDEX, CORPUS = None, []

# --------------------------
# πŸ“„ Safe PDF Text Extraction
# --------------------------
def extract_text_from_pdf(file_path: str) -> str:
    """Extract and concatenate the plain text of every page in a PDF.

    On any failure this returns an ``"Error extracting text from ..."``
    message instead of raising; callers detect failure via that prefix.
    """
    try:
        doc = fitz.open(file_path)
        try:
            pages = [page.get_text("text") for page in doc]
        finally:
            doc.close()
        return "".join(pages)
    except Exception as e:
        return f"Error extracting text from {file_path}: {e}"

# --------------------------
# βœ‚οΈ Chunking Function
# --------------------------
def chunk_text(text: str, chunk_size: int = 800, overlap: int = 120) -> List[str]:
    """Split *text* into overlapping character chunks.

    Args:
        text: Source text to split.
        chunk_size: Maximum characters per chunk.
        overlap: Characters shared between consecutive chunks (must be
            smaller than ``chunk_size`` or the window could never advance).

    Returns:
        Non-empty, whitespace-stripped chunks in document order.

    Raises:
        ValueError: If ``overlap >= chunk_size``.

    Note:
        The previous sliding-window update (``start = end - overlap``) never
        moved past the final window, so any non-empty input looped forever
        (e.g. for a 100-char text, ``start`` clamped back to 0 every pass).
        We now stop as soon as the window reaches the end of the text.
    """
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")
    chunks: List[str] = []
    n = len(text)
    start = 0
    while start < n:
        end = min(start + chunk_size, n)
        piece = text[start:end].strip()
        if piece:
            chunks.append(piece)
        if end >= n:
            # Reached the end of the text — terminate instead of re-scanning
            # the tail (fixes the original infinite loop).
            break
        start = end - overlap
    return chunks

# --------------------------
# πŸ”’ Build FAISS Index
# --------------------------
def build_faiss_index(paths: List[str]):
    """Embed the chunked text of each PDF in *paths* into a FAISS L2 index.

    Returns:
        ``(index, chunks)`` where row i of the index is the embedding of
        ``chunks[i]``.

    Raises:
        RuntimeError: If a PDF fails to extract or no text was found at all.
    """
    all_chunks: List[str] = []
    embedding_blocks = []
    for path in paths:
        raw = extract_text_from_pdf(path)
        # extract_text_from_pdf signals failure via a sentinel prefix.
        if raw.startswith("Error extracting text"):
            raise RuntimeError(raw)
        pieces = chunk_text(raw)
        if not pieces:
            continue
        vecs = embedder.encode(pieces, convert_to_numpy=True, show_progress_bar=False)
        all_chunks.extend(pieces)
        embedding_blocks.append(vecs.astype("float32"))

    if not all_chunks:
        raise RuntimeError("❌ No valid text extracted from PDFs.")

    matrix = np.vstack(embedding_blocks).astype("float32")
    index = faiss.IndexFlatL2(matrix.shape[1])
    index.add(matrix)
    return index, all_chunks

# --------------------------
# πŸ“€ Rebuild Index from Upload
# --------------------------
def rebuild_index_from_upload(files):
    """Gradio callback: persist uploaded PDFs and rebuild the global index.

    Returns a human-readable status string (success or error) for the UI.
    """
    if not files:
        return "⚠️ Please upload at least one PDF."

    saved_paths = []
    for upload in files:
        try:
            # Gradio provides a temp file path automatically (f.name)
            if hasattr(upload, "name") and os.path.exists(upload.name):
                saved_paths.append(upload.name)
                continue
            # fallback in rare case
            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
                tmp.write(upload.read())
                saved_paths.append(tmp.name)
        except Exception as e:
            return f"❌ Error while saving uploaded file: {e}"

    global INDEX, CORPUS
    try:
        INDEX, CORPUS = build_faiss_index(saved_paths)
    except Exception as e:
        return f"❌ Error while building index: {e}"
    return f"✅ Successfully indexed {len(saved_paths)} PDF(s). You can now ask questions!"

# --------------------------
# πŸ” Retrieve Context
# --------------------------
def retrieve_context(query: str, top_k: int = 4) -> str:
    """Return the *top_k* most similar corpus chunks, joined by separators.

    Returns a warning string if no index has been built yet.
    """
    if INDEX is None:
        return "⚠️ Please upload and index PDFs first."
    query_vec = embedder.encode([query], convert_to_numpy=True).astype("float32")
    _, hit_ids = INDEX.search(query_vec, top_k)
    # Guard against FAISS padding results with -1 when the corpus is small.
    matches = []
    for idx in hit_ids[0]:
        if 0 <= idx < len(CORPUS):
            matches.append(CORPUS[idx])
    return "\n\n---\n\n".join(matches)

# --------------------------
# 🧠 Query via Groq LLM
# --------------------------
# System instructions prepended to every prompt; constrains the model to the
# retrieved document context only (reduces hallucinated answers).
SYSTEM_PROMPT = (
    "You are a helpful Civil Engineering assistant. "
    "Use ONLY the provided ASTM or uploaded document context to answer. "
    "If the answer isn't in context, say you cannot find it."
)

def ask_groq(query: str, top_k: int = 4, model: str = "llama-3.3-70b-versatile") -> str:
    """Answer *query* with the Groq LLM, grounded in retrieved PDF context.

    Returns the model's answer, or a user-facing warning/error string when
    the index is missing, no context was found, or the API call fails.
    """
    if INDEX is None:
        return "⚠️ Please upload PDFs first."

    context = retrieve_context(query, top_k)
    if not context.strip():
        return "⚠️ No relevant information found in the uploaded PDFs."

    prompt = f"""{SYSTEM_PROMPT}

Context:
{context}

Question:
{query}
"""

    try:
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.2,
        )
    except Exception as e:
        return f"❌ LLM Error: {e}"
    return response.choices[0].message.content

# --------------------------
# 🎨 Gradio UI
# --------------------------
def ui_ask(query: str, top_k: int):
    """Thin UI wrapper: delegate to ask_groq, surfacing any exception as text."""
    try:
        answer = ask_groq(query, top_k=top_k)
    except Exception as e:
        return f"❌ Error: {e}"
    return answer

with gr.Blocks(title="Civil Engineering RAG (ASTM)") as demo:
    gr.Markdown("## πŸ—οΈ Civil Engineering RAG\nUpload ASTM or civil-engineering PDFs, build an index, and ask questions.")
    
    with gr.Row():
        uploader = gr.File(label="πŸ“„ Upload PDFs", file_count="multiple", file_types=[".pdf"])
        status = gr.Textbox(label="Status", interactive=False)
    uploader.upload(rebuild_index_from_upload, uploader, status)

    gr.Markdown("---")
    inp = gr.Textbox(label="Your Question", placeholder="e.g., What is the curing time for concrete as per ASTM?")
    k = gr.Slider(1, 10, value=4, step=1, label="Top-K passages")
    out = gr.Textbox(label="Answer")
    btn = gr.Button("Ask")
    btn.click(ui_ask, inputs=[inp, k], outputs=[out])

if __name__ == "__main__":
    demo.launch()