# NOTE(review): removed non-Python residue from page extraction (file-size line,
# git-blame commit hashes, and a line-number gutter) — it was not part of the module.
# ============================================
# Civil Engineering RAG (ASTM) - Hugging Face Version
# ============================================
import os
import fitz # PyMuPDF
import faiss
import numpy as np
import gradio as gr
import tempfile
from typing import List
from groq import Groq
from sentence_transformers import SentenceTransformer
# --------------------------
# API key and shared clients
# --------------------------
# Read the Groq API key from the environment; on Hugging Face Spaces this is
# supplied via the repo's Settings -> Secrets panel.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
if not GROQ_API_KEY:
    # Fail fast at import time: the app cannot answer anything without LLM access.
    # NOTE(review): the "β" glyphs below look like mis-encoded emoji from a
    # copy/paste — confirm the intended characters before shipping.
    raise RuntimeError("β Missing GROQ_API_KEY. Please add it in Hugging Face β Settings β Secrets.")
# Initialize Groq client and embedding model (created once, reused by all requests).
client = Groq(api_key=GROQ_API_KEY)
embedder = SentenceTransformer("all-MiniLM-L6-v2")
# Module-level RAG state: FAISS index and the parallel list of text chunks;
# populated by rebuild_index_from_upload().
INDEX, CORPUS = None, []
# --------------------------
# π Safe PDF Text Extraction
# --------------------------
def extract_text_from_pdf(file_path: str) -> str:
    """Extract plain text from every page of a PDF via PyMuPDF.

    Returns the concatenated page text. On any failure it returns an
    error-message string starting with "Error extracting text" instead of
    raising — build_faiss_index keys off that exact prefix.
    """
    try:
        with fitz.open(file_path) as doc:
            pages = [page.get_text("text") for page in doc]
        return "".join(pages)
    except Exception as e:
        return f"Error extracting text from {file_path}: {e}"
# --------------------------
# βοΈ Chunking Function
# --------------------------
def chunk_text(text: str, chunk_size: int = 800, overlap: int = 120) -> List[str]:
    """Split *text* into overlapping chunks of at most *chunk_size* characters.

    Consecutive chunks share *overlap* trailing/leading characters so that
    sentences cut at a boundary still appear whole in one chunk. Whitespace-only
    chunks are dropped.

    Bug fix: the original computed ``start = end - overlap`` unconditionally,
    so once ``end == len(text)`` the window stepped *backwards* and the final
    chunk was appended forever (infinite loop for any text longer than
    ``chunk_size - overlap``). We now stop as soon as the end of text is reached.

    Raises:
        ValueError: if chunk_size is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    # Clamp overlap so the window always advances by at least one character.
    overlap = min(max(overlap, 0), chunk_size - 1)
    chunks: List[str] = []
    start, n = 0, len(text)
    while start < n:
        end = min(start + chunk_size, n)
        piece = text[start:end].strip()
        if piece:
            chunks.append(piece)
        if end >= n:
            break  # reached end of text; stepping back would loop forever
        start = end - overlap
    return chunks
# --------------------------
# π’ Build FAISS Index
# --------------------------
def build_faiss_index(paths: List[str]):
    """Embed the text of the given PDFs and build an L2 FAISS index.

    Returns a ``(index, chunks)`` pair where ``chunks[i]`` is the text whose
    embedding sits at row ``i`` of the index.

    Raises:
        RuntimeError: if any PDF fails to extract, or no text was found at all.
    """
    all_chunks: List[str] = []
    matrices = []
    for path in paths:
        raw = extract_text_from_pdf(path)
        # extract_text_from_pdf signals failure via this string prefix.
        if raw.startswith("Error extracting text"):
            raise RuntimeError(raw)
        pieces = chunk_text(raw)
        if not pieces:
            continue  # e.g. scanned/image-only PDF with no extractable text
        embeddings = embedder.encode(pieces, convert_to_numpy=True, show_progress_bar=False)
        all_chunks.extend(pieces)
        matrices.append(embeddings.astype("float32"))
    if not all_chunks:
        raise RuntimeError("β No valid text extracted from PDFs.")
    matrix = np.vstack(matrices).astype("float32")
    index = faiss.IndexFlatL2(matrix.shape[1])
    index.add(matrix)
    return index, all_chunks
# --------------------------
# π€ Rebuild Index from Upload
# --------------------------
def rebuild_index_from_upload(files):
    """Gradio upload handler: persist the uploaded PDFs and (re)build the index.

    Updates the module globals INDEX and CORPUS on success. Always returns a
    human-readable status string for the Status textbox — it never raises.

    Bug fix: the success message's f-string was broken across two physical
    lines (a SyntaxError for a single-quoted string); it is rejoined into one
    valid literal here.
    """
    if not files:
        return "β οΈ Please upload at least one PDF."
    paths = []
    for f in files:
        try:
            # Gradio provides a temp file path automatically (f.name)
            if hasattr(f, "name") and os.path.exists(f.name):
                temp_path = f.name
            else:
                # Fallback in the rare case we get a file-like object instead.
                with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
                    tmp.write(f.read())
                temp_path = tmp.name
            paths.append(temp_path)
        except Exception as e:
            return f"β Error while saving uploaded file: {e}"
    try:
        global INDEX, CORPUS
        INDEX, CORPUS = build_faiss_index(paths)
        return f"β Successfully indexed {len(paths)} PDF(s). You can now ask questions!"
    except Exception as e:
        return f"β Error while building index: {e}"
# --------------------------
# π Retrieve Context
# --------------------------
def retrieve_context(query: str, top_k: int = 4) -> str:
    """Return the *top_k* most similar indexed chunks, joined by '---' dividers.

    Returns a warning string when no index has been built yet.
    """
    if INDEX is None:
        return "β οΈ Please upload and index PDFs first."
    query_vec = embedder.encode([query], convert_to_numpy=True).astype("float32")
    _, hit_ids = INDEX.search(query_vec, top_k)
    # FAISS pads with -1 when fewer than top_k vectors exist; filter those out.
    passages = [CORPUS[i] for i in hit_ids[0] if 0 <= i < len(CORPUS)]
    return "\n\n---\n\n".join(passages)
# --------------------------
# π§ Query via Groq LLM
# --------------------------
# Instruction prefix prepended to every LLM prompt; constrains answers to the
# retrieved document context only (grounded RAG behavior).
SYSTEM_PROMPT = (
    "You are a helpful Civil Engineering assistant. "
    "Use ONLY the provided ASTM or uploaded document context to answer. "
    "If the answer isn't in context, say you cannot find it."
)
def ask_groq(query: str, top_k: int = 4, model: str = "llama-3.3-70b-versatile") -> str:
    """Answer *query* with the Groq LLM, grounded in the retrieved PDF context.

    Returns the model's answer, or a warning/error string when no index exists,
    no context matches, or the API call fails — it never raises.
    """
    if INDEX is None:
        return "β οΈ Please upload PDFs first."
    context = retrieve_context(query, top_k)
    if not context.strip():
        return "β οΈ No relevant information found in the uploaded PDFs."
    # Single-turn prompt: system instructions, then context, then the question.
    prompt = f"""{SYSTEM_PROMPT}
Context:
{context}
Question:
{query}
"""
    try:
        # Low temperature keeps answers close to the supplied context.
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.2,
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"β LLM Error: {e}"
# --------------------------
# π¨ Gradio UI
# --------------------------
def ui_ask(query: str, top_k: int):
    """Gradio click handler: delegate to ask_groq, never let an exception escape."""
    try:
        answer = ask_groq(query, top_k=top_k)
    except Exception as e:
        return f"β Error: {e}"
    return answer
# --------------------------
# Gradio UI layout and event wiring.
# --------------------------
with gr.Blocks(title="Civil Engineering RAG (ASTM)") as demo:
    gr.Markdown("## ποΈ Civil Engineering RAG\nUpload ASTM or civil-engineering PDFs, build an index, and ask questions.")
    with gr.Row():
        uploader = gr.File(label="π Upload PDFs", file_count="multiple", file_types=[".pdf"])
        status = gr.Textbox(label="Status", interactive=False)
    # Rebuild the FAISS index whenever new files are uploaded; status text
    # goes to the read-only Status box.
    uploader.upload(rebuild_index_from_upload, uploader, status)
    gr.Markdown("---")
    inp = gr.Textbox(label="Your Question", placeholder="e.g., What is the curing time for concrete as per ASTM?")
    # top_k slider feeds ui_ask -> ask_groq -> retrieve_context.
    k = gr.Slider(1, 10, value=4, step=1, label="Top-K passages")
    out = gr.Textbox(label="Answer")
    btn = gr.Button("Ask")
    btn.click(ui_ask, inputs=[inp, k], outputs=[out])

if __name__ == "__main__":
    demo.launch()
# NOTE(review): removed a stray "|" left over from page extraction (it would be a SyntaxError).