sejalkishan commited on
Commit
de6872f
·
verified ·
1 Parent(s): d8c1543

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -31
app.py CHANGED
@@ -8,36 +8,19 @@ from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
8
  from huggingface_hub import login
9
  import torch
10
  import os
 
11
 
12
- # πŸ” Authenticate using Hugging Face token (if needed for gated repos)
13
- # If you're using public models, this can be commented out.
14
- if os.environ.get("HF_TOKEN"):
15
- login(token=os.environ["HF_TOKEN"])
16
 
17
- # πŸš€ Check if GPU is available
18
- if not torch.cuda.is_available():
19
- raise RuntimeError("❌ GPU not detected! Please enable GPU in Space settings.")
20
-
21
- print(f"βœ… Using GPU: {torch.cuda.get_device_name(0)}")
22
 
23
- # πŸ”§ Load model
24
  model_id = "mistralai/Mistral-7B-Instruct-v0.2"
25
- tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=True)
26
-
27
- model = AutoModelForCausalLM.from_pretrained(
28
- model_id,
29
- device_map="auto",
30
- torch_dtype=torch.float16,
31
- use_auth_token=True,
32
- trust_remote_code=True
33
- )
34
 
35
- generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
36
-
37
- # 🧠 Load EasyOCR
38
- reader = easyocr.Reader(['en'], gpu=torch.cuda.is_available())
39
-
40
- # πŸ“„ Extract text from PDF with OCR fallback
41
  def extract_text_from_pdf(file):
42
  text = ""
43
  with pdfplumber.open(file) as pdf:
@@ -59,7 +42,7 @@ def extract_text_from_docx(file):
59
  doc = docx.Document(file)
60
  return "\n".join([para.text for para in doc.paragraphs if para.text.strip() != ""])
61
 
62
- # 🧩 Chunk text for LLM processing
63
  def chunk_text(text, max_chars=6000):
64
  paragraphs = text.split("\n")
65
  chunks, current_chunk = [], ""
@@ -73,7 +56,7 @@ def chunk_text(text, max_chars=6000):
73
  chunks.append(current_chunk)
74
  return chunks
75
 
76
- # 🧠 LLM Prompt Template
77
  def create_prompt(text_chunk):
78
  return f"""You are an expert in analyzing tender and project documents. Read the following content and extract:
79
  1. Total manpower required
@@ -86,7 +69,8 @@ CONTENT:
86
  {text_chunk}
87
  """
88
 
89
- # πŸ” Main handler
 
90
  def analyze_document(file):
91
  filename = file.name
92
  ext = os.path.splitext(filename)[-1].lower()
@@ -101,8 +85,20 @@ def analyze_document(file):
101
  if len(raw_text.strip()) == 0:
102
  return "❌ No text found in the document."
103
 
104
- chunks = chunk_text(raw_text)
 
 
 
 
 
 
 
 
105
 
 
 
 
 
106
  full_summary = ""
107
  for chunk in chunks:
108
  prompt = create_prompt(chunk)
@@ -114,8 +110,8 @@ def analyze_document(file):
114
 
115
  # 🎨 Gradio Interface
116
  with gr.Blocks() as demo:
117
- gr.Markdown("## πŸ“„ Smart Document Analyzer – Tender & Technical Documents (GPU-Powered, OCR-Ready)")
118
- gr.Markdown("Upload a PDF or DOCX file. This tool extracts manpower, timeline, technical needs, and budget using a powerful LLM with OCR support for scanned PDFs.")
119
 
120
  with gr.Row():
121
  file_input = gr.File(label="πŸ“Ž Upload PDF or Word Document")
 
8
  from huggingface_hub import login
9
  import torch
10
  import os
11
+ import spaces
12
 
13
+ # πŸ” Authenticate if token is provided (for gated models)
14
+ if os.environ.get("token"):
15
+ login(token=os.environ["token"])
 
16
 
17
+ # 🧠 Load EasyOCR Reader once (outside GPU scope)
18
+ reader = easyocr.Reader(['en'], gpu=torch.cuda.is_available())
 
 
 
19
 
20
+ # πŸ”§ Static model ID
21
  model_id = "mistralai/Mistral-7B-Instruct-v0.2"
 
 
 
 
 
 
 
 
 
22
 
23
+ # πŸ“„ Extract text from PDF (supports OCR fallback)
 
 
 
 
 
24
  def extract_text_from_pdf(file):
25
  text = ""
26
  with pdfplumber.open(file) as pdf:
 
42
  doc = docx.Document(file)
43
  return "\n".join([para.text for para in doc.paragraphs if para.text.strip() != ""])
44
 
45
+ # 🧩 Chunk text into 6000-character parts
46
  def chunk_text(text, max_chars=6000):
47
  paragraphs = text.split("\n")
48
  chunks, current_chunk = [], ""
 
56
  chunks.append(current_chunk)
57
  return chunks
58
 
59
+ # 🧠 Prompt template
60
  def create_prompt(text_chunk):
61
  return f"""You are an expert in analyzing tender and project documents. Read the following content and extract:
62
  1. Total manpower required
 
69
  {text_chunk}
70
  """
71
 
72
+ # 🧠 GPU-decorated main function β€” forces GPU allocation during processing
73
+ @spaces.GPU(duration=300) # up to 5 minutes GPU time (300 s; comment previously said 10 minutes)
74
  def analyze_document(file):
75
  filename = file.name
76
  ext = os.path.splitext(filename)[-1].lower()
 
85
  if len(raw_text.strip()) == 0:
86
  return "❌ No text found in the document."
87
 
88
+ # βœ… Load model and tokenizer INSIDE GPU scope
89
+ tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=True)
90
+ model = AutoModelForCausalLM.from_pretrained(
91
+ model_id,
92
+ device_map="auto",
93
+ torch_dtype=torch.float16,
94
+ use_auth_token=True,
95
+ trust_remote_code=True
96
+ )
97
 
98
+ generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
99
+
100
+ # πŸ” Chunked generation
101
+ chunks = chunk_text(raw_text)
102
  full_summary = ""
103
  for chunk in chunks:
104
  prompt = create_prompt(chunk)
 
110
 
111
  # 🎨 Gradio Interface
112
  with gr.Blocks() as demo:
113
+ gr.Markdown("## πŸ“„ Smart Document Analyzer – Tender & Technical Docs (GPU + OCR Ready)")
114
+ gr.Markdown("Upload a PDF (scanned or normal) or Word file. Extract manpower, deadlines, tech needs, and budgets using LLM + OCR.")
115
 
116
  with gr.Row():
117
  file_input = gr.File(label="πŸ“Ž Upload PDF or Word Document")