tbaig1605 commited on
Commit
21894ea
Β·
verified Β·
1 Parent(s): 1b33bfc

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +141 -0
app.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import docx
3
+ import numpy as np
4
+ import streamlit as st
5
+ from sentence_transformers import SentenceTransformer
6
+ from transformers import AutoTokenizer
7
+ import faiss
8
+ from groq import Groq
9
+
10
# ==========================================================
# GROQ API KEY (use HF Secrets)
# ==========================================================
# BUG FIX: the original code hard-coded a Groq API key and passed it as
# the *name* of the environment variable to os.getenv(), which returns
# None (no variable by that name exists) and then crashes with
# `TypeError: str expected, not NoneType` on the os.environ assignment.
# It also leaked the secret in source control — that key must be revoked.
# The Groq client reads GROQ_API_KEY from the environment on its own, so
# simply set a secret named GROQ_API_KEY in the deployment (HF Spaces
# "Secrets") and do not overwrite it here.
if not os.getenv("GROQ_API_KEY"):
    st.warning("GROQ_API_KEY is not set — answer generation will fail.")

# ==========================================================
# STREAMLIT UI
# ==========================================================
st.set_page_config(page_title="Word RAG App", layout="wide")
st.title("📄 Word Document RAG")

uploaded_file = st.file_uploader(
    "Upload a Word document",
    type=["docx"],
)
25
+
26
+ # ==========================================================
27
+ # WORD TEXT EXTRACTION (UNCHANGED LOGIC)
28
+ # ==========================================================
29
def read_word(doc_path):
    """Extract all non-empty paragraphs from a .docx file.

    Returns a single-element list shaped like [{"page": 1, "text": ...}]
    so downstream code can treat a Word file like a paged document.
    """
    document = docx.Document(doc_path)
    paragraphs = (p.text for p in document.paragraphs if p.text.strip() != "")
    return [{"page": 1, "text": "\n\n".join(paragraphs)}]
33
+
34
+ # ==========================================================
35
+ # CORE RAG FUNCTIONS (UNCHANGED)
36
+ # ==========================================================
37
def chunk_text(pages, chunk_size=800):
    """Greedily pack paragraphs into roughly chunk_size-character chunks.

    Each chunk records the page number of the page it came from.  A
    paragraph that would overflow the current buffer starts a new chunk
    (so one over-long paragraph becomes a chunk on its own).
    """
    chunks = []
    for page in pages:
        buffer = ""
        for para in page["text"].split("\n\n"):
            if len(buffer) + len(para) <= chunk_size:
                # Same accumulation as the original: a space joins paragraphs,
                # and the leading space is removed by .strip() on emission.
                buffer = f"{buffer} {para}"
            else:
                chunks.append({"page": page["page"], "text": buffer.strip()})
                buffer = para
        if buffer:
            chunks.append({"page": page["page"], "text": buffer.strip()})
    return chunks
51
+
52
def tokenize_chunks(chunks, model_name="sentence-transformers/all-mpnet-base-v2"):
    """Tokenize every chunk's text with the given HF tokenizer.

    Returns one list of input ids per chunk (truncated to model length).
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    token_ids = []
    for chunk in chunks:
        encoded = tokenizer(chunk["text"], truncation=True)
        token_ids.append(encoded["input_ids"])
    return token_ids
55
+
56
def create_embeddings(chunks, model_name="allenai/specter"):
    """Embed each chunk's text with a SentenceTransformer.

    Returns (embedder, embeddings) where embeddings is a 2-D numpy array,
    one row per chunk.
    """
    model = SentenceTransformer(model_name)
    vectors = model.encode(
        [chunk["text"] for chunk in chunks],
        show_progress_bar=False,
    )
    return model, np.array(vectors)
61
+
62
def store_embeddings(embeddings):
    """L2-normalize the embeddings and index them for inner-product search.

    With normalized vectors, inner product equals cosine similarity.
    NOTE(review): faiss.normalize_L2 mutates the caller's array in place
    and expects contiguous float32 input — confirm upstream dtype.
    """
    faiss.normalize_L2(embeddings)
    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings)
    return index
68
+
69
def retrieve_chunks(query, embedder, index, chunks, top_k=None):
    """Return up to top_k chunks most similar to the query.

    Defaults to min(20, number of chunks) when top_k is falsy; similarity
    is cosine via inner product on L2-normalized vectors.
    """
    k = top_k if top_k else min(20, len(chunks))
    query_vec = embedder.encode([query])
    faiss.normalize_L2(query_vec)
    _scores, hits = index.search(query_vec, k)
    return [chunks[i] for i in hits[0]]
76
+
77
def build_safe_context(retrieved_chunks, max_chars=12000):
    """Concatenate retrieved chunks into a prompt context string.

    The top three chunks are always included — even if that exceeds
    max_chars — so the model always has the strongest matches; chunks
    after the third are appended only while the running total fits
    inside the max_chars budget.
    """
    parts = []
    total = 0
    for rank, chunk in enumerate(retrieved_chunks):
        block = f"(Page {chunk['page']}) {chunk['text']}\n\n"
        if rank >= 3 and total + len(block) > max_chars:
            break
        parts.append(block)
        total += len(block)
    return "".join(parts)
91
+
92
def generate_answer(query, context):
    """Ask the Groq chat model to answer the query using only the context.

    Relies on GROQ_API_KEY being set in the environment (Groq() reads it).
    Returns the model's answer text.
    """
    prompt = f"""
You are a document-based assistant.
Use the context to answer the question clearly.
If the answer is partially available, summarize it.
If the answer is not present, you may say 'Not found in the document'.

Context:
{context}

Question:
{query}
"""
    client = Groq()
    completion = client.chat.completions.create(
        model="llama-3.1-8b-instant",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.3,
    )
    return completion.choices[0].message.content
112
+
113
+ # ==========================================================
114
+ # APP LOGIC
115
+ # ==========================================================
116
# ==========================================================
# APP LOGIC
# ==========================================================
if uploaded_file:
    with st.spinner("📄 Reading Word document..."):
        # FIX: the original wrote the upload into the working directory
        # under the raw client-supplied filename (collision-prone, a path
        # traversal risk, and never cleaned up).  Use a temp file instead
        # and always remove it.
        import tempfile

        with tempfile.NamedTemporaryFile(delete=False, suffix=".docx") as tmp:
            tmp.write(uploaded_file.getbuffer())
            tmp_path = tmp.name
        try:
            pages = read_word(tmp_path)
        finally:
            os.remove(tmp_path)

    with st.spinner("✂️ Chunking & embedding document..."):
        # NOTE(review): Streamlit re-runs this script on every interaction,
        # so the document is re-chunked and re-embedded for each question;
        # consider caching in st.session_state keyed by the file name.
        chunks = chunk_text(pages)
        tokenize_chunks(chunks)  # return value unused in the original; kept for parity
        embedder, embeddings = create_embeddings(chunks)
        index = store_embeddings(embeddings)

    st.success("✅ Document indexed successfully")

    query = st.text_input("❓ Ask a question")

    if query:
        with st.spinner("🤖 Generating answer..."):
            retrieved_chunks = retrieve_chunks(query, embedder, index, chunks)
            context = build_safe_context(retrieved_chunks)
            answer = generate_answer(query, context)

        st.markdown("### ✅ Answer")
        st.write(answer)