# app.py
from flask import Flask, request, render_template, send_file, redirect, url_for, jsonify
import os
import re
import uuid
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from PyPDF2 import PdfReader
# Optional NLI (not required for this feature)
try:
    from transformers import pipeline as hf_pipeline
    nli = hf_pipeline("text-classification", model="microsoft/deberta-large-mnli")
    print("✅ NLI pipeline loaded.")
except Exception as e:
    nli = None
    print("ℹ️ NLI pipeline not loaded (optional):", e)

print("⏳ Loading SentenceTransformer model...")
model = SentenceTransformer('all-MiniLM-L6-v2')
print("✅ Encoder loaded.")

app = Flask(__name__)
# ── base folders ───────────────────────────────────────────────────────────────
BASE_DIR = os.path.dirname(__file__)
BASE_UPLOADS = os.path.join(BASE_DIR, "uploads")
BASE_RESULTS = os.path.join(BASE_DIR, "results")
os.makedirs(BASE_UPLOADS, exist_ok=True)
os.makedirs(BASE_RESULTS, exist_ok=True)
# ── clear uploads at launch ────────────────────────────────────────────────────
def clear_uploads_folder():
    """Remove all files and subfolders inside the uploads folder on app launch."""
    for entry in os.listdir(BASE_UPLOADS):
        path = os.path.join(BASE_UPLOADS, entry)
        if os.path.isdir(path):
            for root, dirs, files in os.walk(path, topdown=False):
                for fname in files:
                    os.remove(os.path.join(root, fname))
                for dname in dirs:
                    os.rmdir(os.path.join(root, dname))
            os.rmdir(path)
        else:
            os.remove(path)

clear_uploads_folder()
print("✅ Uploads folder cleared.")

# runtime cache keyed by search-id:
# (paragraphs_norm, embeddings, faiss-index, spans, para_file_idx, file_meta)
# spans[i] = (start_char, end_char) of paragraph i within merged.txt
# para_file_idx[i] = index into file_meta for paragraph i
# file_meta[j] = {"name": filename, "start": start_char, "end": end_char, "second_line": str}
index_data = {}
# ── helpers ────────────────────────────────────────────────────────────────────
def get_paths(sid: str):
    """Return per-search folders & files, creating them if needed."""
    up_folder = os.path.join(BASE_UPLOADS, sid)
    res_folder = os.path.join(BASE_RESULTS, sid)
    os.makedirs(up_folder, exist_ok=True)
    os.makedirs(res_folder, exist_ok=True)
    merged_file = os.path.join(res_folder, "merged.txt")
    result_file = os.path.join(res_folder, "results.txt")
    return up_folder, res_folder, merged_file, result_file

def compute_second_line(raw_text: str) -> str:
    """Return the 2nd non-empty line if available, else the literal 2nd line, else ''."""
    lines = raw_text.splitlines()
    non_empty = [ln.strip() for ln in lines if ln.strip() != ""]
    if len(non_empty) >= 2:
        return non_empty[1]
    if len(lines) >= 2:
        return lines[1].strip()
    return ""
def extract_for_merge_and_second(file_path: str):
    """
    Return a tuple (merged_text_piece, second_line_str) for a single file.
    - For .txt: merged part is raw file text.
    - For .pdf: merged part is lightly cleaned text; the second line is computed
      from the raw extracted line structure as well.
    """
    if file_path.lower().endswith(".txt"):
        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
            raw = f.read()
        second = compute_second_line(raw)
        return raw, second
    if file_path.lower().endswith(".pdf"):
        reader = PdfReader(file_path)
        pages = []
        for page in reader.pages:
            t = page.extract_text() or ""
            pages.append(t)
        raw_lines_joined = "\n".join(pages)  # preserve some line structure for the 2nd line
        second = compute_second_line(raw_lines_joined)
        # Light cleanup for the merged view
        full_text = " ".join(pages)
        full_text = re.sub(r'(?<=[.!?])\s{2,}', '\n\n', full_text)
        full_text = re.sub(r'(\n\s*){2,}', '\n\n', full_text)
        return full_text, second
    return "", ""
def split_paragraphs_with_spans(merged_text: str):
    """
    Split merged_text into logical 'paragraphs' on runs of newlines (the regex
    matches single line breaks as well as blank lines), returning normalized
    paragraphs for embedding AND exact spans (start, end) in the original
    merged_text for highlighting/jumping.
    """
    sep = re.compile(r'(?:\r?\n)+', flags=re.MULTILINE)
    paras_norm = []
    spans = []
    pos = 0
    for m in sep.finditer(merged_text):
        seg = merged_text[pos:m.start()]
        norm = re.sub(r'\s+', ' ', seg).strip()
        if len(norm.split()) > 4:  # keep only substantive chunks
            paras_norm.append(norm)
            spans.append((pos, m.start()))
        pos = m.end()
    # Tail
    seg = merged_text[pos:]
    norm = re.sub(r'\s+', ' ', seg).strip()
    if len(norm.split()) > 4:
        paras_norm.append(norm)
        spans.append((pos, len(merged_text)))
    return paras_norm, spans
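
# Sketch of the contract with made-up text: for
#   merged = "one two three four five\n\nshort\n\nsix seven eight nine ten"
# the middle chunk has fewer than 5 words and is dropped, while the two
# surviving paragraphs get spans (0, 23) and (32, 56) back into `merged`,
# so merged[start:end] reproduces each paragraph exactly.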
def rebuild_merged_and_index(sid: str):
    """Build merged.txt, paragraph embeddings, spans, and per-paragraph file mapping."""
    up_folder, res_folder, merged_file, _ = get_paths(sid)
    merged_text = ""
    file_meta = []  # list of dicts with name, start, end, second_line
    # Append files in sorted order for stability
    for filename in sorted(os.listdir(up_folder)):
        if not filename.lower().endswith((".pdf", ".txt")):
            continue
        file_path = os.path.join(up_folder, filename)
        part, second = extract_for_merge_and_second(file_path)
        part = part.rstrip()
        if not part:
            continue
        start = len(merged_text)
        merged_text += part + "\n\n"  # add separator after each file
        end = start + len(part)  # char range covering the file's text (excludes our extra \n\n)
        file_meta.append({"name": filename, "start": start, "end": end, "second_line": second})
    with open(merged_file, "w", encoding='utf-8') as f:
        f.write(merged_text)
    paras_norm, spans = split_paragraphs_with_spans(merged_text)
    if not paras_norm:
        index_data[sid] = ([], None, None, [], [], [])
        return
    # Map each paragraph span to its originating file via start position
    para_file_idx = []
    for (pstart, _pend) in spans:
        assigned = None
        for j, meta in enumerate(file_meta):
            next_start = file_meta[j + 1]["start"] if j + 1 < len(file_meta) else float("inf")
            if meta["start"] <= pstart < next_start:
                assigned = j
                break
        if assigned is None:
            assigned = max(0, len(file_meta) - 1)
        para_file_idx.append(assigned)
    # Build embeddings + FAISS
    embed = model.encode(paras_norm, batch_size=32, show_progress_bar=False)
    embed = np.asarray(embed)
    if embed.ndim == 1:
        embed = embed[np.newaxis, :]
    faiss.normalize_L2(embed)
    idx = faiss.IndexFlatIP(embed.shape[1])
    idx.add(embed)
    index_data[sid] = (paras_norm, embed, idx, spans, para_file_idx, file_meta)
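
# Because the paragraph vectors are L2-normalized before idx.add(), the
# inner-product scores that IndexFlatIP returns from search() are cosine
# similarities, so higher scores mean semantically closer paragraphs.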
# ── routes ─────────────────────────────────────────────────────────────────────
@app.route("/", methods=["GET", "POST"])
def index():
    sid = request.args.get("sid") or request.form.get("sid")
    if not sid:
        sid = str(uuid.uuid4())
    up_folder, _, _, _ = get_paths(sid)
    paragraphs, embeddings, index_faiss, spans, para_file_idx, file_meta = index_data.get(
        sid, ([], None, None, [], [], [])
    )
    uploaded_filenames = sorted(os.listdir(up_folder))
    results = []
    query = ""
    k = 5
    if request.method == "POST":
        query = request.form.get("query", "").strip()
        try:
            k = int(request.form.get("topk", 5))
        except ValueError:
            k = 5
        if paragraphs and query:
            q_embed = model.encode([query])
            q_embed = np.asarray(q_embed)
            if q_embed.ndim == 1:
                q_embed = q_embed[np.newaxis, :]
            faiss.normalize_L2(q_embed)
            D, I = index_faiss.search(q_embed, k=min(k, len(paragraphs)))
            # Build result objects with file name + the file's 2nd line
            for i in I[0]:
                i = int(i)
                file_idx = para_file_idx[i] if 0 <= i < len(para_file_idx) else -1
                fname = file_meta[file_idx]["name"] if 0 <= file_idx < len(file_meta) else "unknown"
                second = file_meta[file_idx]["second_line"] if 0 <= file_idx < len(file_meta) else ""
                results.append({
                    "idx": i,
                    "text": paragraphs[i],
                    "file": fname,
                    "second_line": second
                })
            _, res_folder, _, result_file = get_paths(sid)
            with open(result_file, "w", encoding='utf-8') as f:
                for r in results:
                    f.write(r["text"] + "\n\n")
    return render_template(
        "index.html",
        results=results,
        query=query,
        topk=k,
        sid=sid,
        uploaded_filenames=uploaded_filenames
    )
@app.route("/upload", methods=["POST"])
def upload_file():
sid = request.args.get("sid")
if not sid:
return ("Missing sid", 400)
up_folder, _, _, _ = get_paths(sid)
uploaded_files = request.files.getlist("file")
for file in uploaded_files:
if file and file.filename.lower().endswith((".pdf", ".txt")):
file.save(os.path.join(up_folder, file.filename))
rebuild_merged_and_index(sid)
return ("", 204)
@app.route("/download")
def download():
sid = request.args.get("sid")
if not sid:
return ("Missing sid", 400)
_, _, _, result_file = get_paths(sid)
if not os.path.exists(result_file):
return ("Nothing to download", 404)
return send_file(result_file, as_attachment=True)
@app.route("/download_merged")
def download_merged():
sid = request.args.get("sid")
if not sid:
return ("Missing sid", 400)
_, _, merged_file, _ = get_paths(sid)
if not os.path.exists(merged_file):
return ("Nothing to download", 404)
return send_file(merged_file, as_attachment=True)
@app.route("/reset")
def reset():
sid = request.args.get("sid")
if not sid:
return redirect(url_for('index'))
up_folder, res_folder, _, _ = get_paths(sid)
for folder in [up_folder, res_folder]:
if os.path.exists(folder):
for f in os.listdir(folder):
os.remove(os.path.join(folder, f))
index_data.pop(sid, None) # drop cached embeddings
return redirect(url_for('index'))
@app.route("/api/ping")
def ping():
return "pong", 200
@app.route("/api/context")
def api_context():
"""
Return FULL merged text plus the exact character span for the requested paragraph.
Query params: sid, idx (int)
Response: { merged: str, start: int, end: int, total_len: int }
"""
sid = request.args.get("sid")
if not sid:
return jsonify(error="Missing sid"), 400
try:
idx = int(request.args.get("idx", "-1"))
except (TypeError, ValueError):
return jsonify(error="Bad idx"), 400
paragraphs, _, _, spans, _, _ = index_data.get(sid, (None, None, None, None, None, None))
if paragraphs is None or spans is None:
return jsonify(error="No index for this sid. Upload files first."), 404
if not (0 <= idx < len(spans)):
return jsonify(error="idx out of range"), 400
_, _, merged_file, _ = get_paths(sid)
if not os.path.exists(merged_file):
return jsonify(error="merged.txt not found"), 404
with open(merged_file, "r", encoding="utf-8") as f:
merged_text = f.read()
start, end = spans[idx]
return jsonify(
merged=merged_text,
start=start,
end=end,
total_len=len(merged_text)
)
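
# Usage sketch (sid and idx are placeholders):
#   curl "http://localhost:7860/api/context?sid=<your-sid>&idx=0"
# returns {"merged": "...", "start": ..., "end": ..., "total_len": ...}, where
# merged[start:end] is the requested paragraph's exact text within merged.txt.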

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=7860)