Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import pdfplumber | |
| import docx | |
| import os | |
| import pytesseract | |
| from PIL import Image | |
| from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline | |
| import torch | |
| from huggingface_hub import login | |
| import spaces | |
# Authenticate against the Hugging Face Hub with the token stored in the
# "token" environment variable (the same value is reused for the downloads).
hf_token = os.environ.get("token")
login(token=hf_token)

# Refuse to start without CUDA: the model below is loaded in float16 with
# device_map="auto".
if torch.cuda.is_available():
    print(f"β Using GPU: {torch.cuda.get_device_name(0)}")
else:
    raise RuntimeError("β GPU not detected! Please enable GPU in Space settings.")

# Load the tokenizer and model once at import time and wrap them in a
# text-generation pipeline shared by every request.
model_id = "mistralai/Mistral-7B-Instruct-v0.2"
tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.float16,
    token=hf_token,
    trust_remote_code=True,
)
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
# Text extractors
def extract_text_from_pdf(file):
    """Return the text of every page of a PDF, one newline after each page.

    Pages for which ``page.extract_text()`` yields nothing are rendered to an
    image (``resolution=300``) and passed through Tesseract OCR instead.

    Args:
        file: Anything ``pdfplumber.open`` accepts.

    Returns:
        The concatenated page texts as a single string.
    """
    page_texts = []
    with pdfplumber.open(file) as pdf:
        for page in pdf.pages:
            extracted = page.extract_text()
            if not extracted:
                # No usable text layer on this page: fall back to OCR.
                rendered = page.to_image(resolution=300).original
                extracted = pytesseract.image_to_string(rendered)
            page_texts.append(extracted + "\n")
    return "".join(page_texts)
def extract_text_from_docx(file):
    """Return the non-blank paragraphs of a .docx document joined by newlines.

    Args:
        file: Anything ``docx.Document`` accepts.

    Returns:
        The paragraph texts, in document order, separated by ``"\\n"``.
        Paragraphs that are empty or whitespace-only are skipped.
    """
    document = docx.Document(file)
    kept = []
    for paragraph in document.paragraphs:
        content = paragraph.text
        if content.strip():
            kept.append(content)
    return "\n".join(kept)
def chunk_text(text, max_chars=6000):
    """Split text into newline-aligned chunks of roughly ``max_chars`` characters.

    Lines are accumulated (each followed by ``"\\n"``) while the running chunk
    stays below ``max_chars``; the line that would overflow starts a new chunk.

    Args:
        text: The text to split.
        max_chars: Size budget for a chunk. A single line that is itself
            longer than the budget is kept whole in a chunk of its own, so
            that chunk may exceed ``max_chars``.

    Returns:
        A list of non-empty chunk strings, in order.
    """
    chunks = []
    current_chunk = ""
    for para in text.split("\n"):
        if len(current_chunk) + len(para) < max_chars:
            current_chunk += para + "\n"
            continue
        # Only flush a chunk that has content: when the very first line is
        # already over budget there is nothing accumulated yet, and emitting
        # "" would produce a useless empty chunk downstream.
        if current_chunk:
            chunks.append(current_chunk)
        current_chunk = para + "\n"
    if current_chunk:
        chunks.append(current_chunk)
    return chunks
# Prompt generator
def create_prompt(text_chunk):
    """Build the 20-question tender-analysis prompt for one chunk of text.

    The prompt consists of an instruction line, the expected ``Qn:``/``An:``
    answer layout, the numbered question list and finally the chunk itself
    under a ``CONTENT:`` heading. It starts and ends with a newline.

    Args:
        text_chunk: The document excerpt to be analyzed.

    Returns:
        The complete prompt string.
    """
    instruction = (
        "You are an expert in analyzing U.S. government tender documents. "
        "Based on the content provided, answer the following 20 questions "
        "in Q&A format (no explanations, no repeated questions):"
    )
    answer_layout = ("Q1: ...", "A1: ...", "...", "Q20: ...", "A20: ...")
    questions = (
        "What is the general scope of the tender?",
        "Are certifications like SAM, CMMI, ISO, SBA, 8(a), or GSA required?",
        "Is there a Set-aside status (e.g., 8a, SDVOSB)?",
        "Are U.S. citizens or security-cleared staff required?",
        "What is the expected team size or key qualifications?",
        "Are offshore resources allowed?",
        "What is the mode of working (On-site/Remote/Hybrid)?",
        "Is presence in specific regions/states required?",
        "Is the delivery location defined?",
        "Is remote or offshore delivery allowed?",
        "Is a U.S. office presence required?",
        "Are travel/lodging expenses reimbursable?",
        "Are cybersecurity frameworks (FedRAMP, NIST, HIPAA) required?",
        "Are background checks or security clearance needed?",
        "Is past experience required?",
        "How many references are required?",
        "Are only U.S. references accepted?",
        "Is private sector experience allowed?",
        "Do references need to be identified?",
        "Is subcontracting permitted?",
    )
    numbered = [f"{number}. {question}" for number, question in enumerate(questions, start=1)]
    # The empty first and last entries give the leading and trailing newline.
    lines = [
        "",
        instruction,
        "just answers in this format",
        *answer_layout,
        "these are the questions to be answered",
        *numbered,
        "CONTENT:",
        f"{text_chunk}",
        "",
    ]
    return "\n".join(lines)
# Cleaner to remove repeated prompt text
def clean_output(raw_output):
    """Trim generated text down to the span from ``Q1:`` to the ``A20:`` line.

    Everything before the first ``"Q1:"`` is dropped. If an ``"A20:"`` marker
    follows, the result is cut at the end of the line containing its first
    occurrence. Input without ``"Q1:"`` is returned unchanged apart from
    surrounding whitespace.

    NOTE: matching is purely textual and uses the first occurrence of each
    marker, so any earlier literal ``Q1:``/``A20:`` in ``raw_output`` wins.

    Args:
        raw_output: Text produced by the model.

    Returns:
        The whitespace-stripped Q&A section.
    """
    _preamble, marker, remainder = raw_output.partition("Q1:")
    if not marker:
        return raw_output.strip()

    answers = marker + remainder
    last_answer_at = answers.find("A20:")
    if last_answer_at == -1:
        return answers.strip()

    # Keep the A20 line itself; discard whatever follows its line break.
    line_end = answers.find("\n", last_answer_at)
    if line_end == -1:
        line_end = len(answers)
    return answers[:line_end].strip()
# Main analyzer
def analyze_document(file, cancel_flag):
    """Extract text from an uploaded tender and run the Q&A prompt on each chunk.

    Args:
        file: The value delivered by the ``gr.File`` input. Handled as either
            an object exposing ``.name`` or a plain path string (which of the
            two arrives depends on the Gradio version/configuration), or
            ``None`` when nothing was uploaded.
        cancel_flag: Truthy to abort before any generation starts. The value
            is a snapshot taken when the function is called; it cannot change
            while the function runs.

    Returns:
        A ``(result_text, status_text)`` tuple for the output and status boxes.
    """
    if file is None:
        # Analyze was clicked without an upload; `file.name` would crash.
        return "No file uploaded", "Waiting for upload"

    path = file if isinstance(file, str) else file.name
    ext = os.path.splitext(path)[-1].lower()
    if ext == ".pdf":
        raw_text = extract_text_from_pdf(file)
    elif ext == ".docx":
        raw_text = extract_text_from_docx(file)
    else:
        return "β Unsupported file format", "β Invalid format"

    if not raw_text.strip():
        return "β No text found in document", "β Empty"

    chunks = chunk_text(raw_text)
    if cancel_flag:
        return "β Cancelled", "β"

    sections = []
    for chunk in chunks:
        prompt = create_prompt(chunk)
        # return_full_text=False: by default the pipeline echoes the prompt in
        # front of the completion, and the prompt's own "Q1: ..."/"A20: ..."
        # format example would then be what clean_output() extracts.
        result = generator(
            prompt,
            max_new_tokens=1024,
            do_sample=False,
            return_full_text=False,
        )[0]["generated_text"]
        sections.append(clean_output(result))

    # Join (rather than append a separator after every chunk) so the result
    # does not end with a dangling "---".
    return "\n\n---\n\n".join(sections).strip(), "β Completed"
# Interface
# Gradio UI: upload + buttons + status on the left, results on the right.
with gr.Blocks(title="Smart Tender Analyzer - US Edition") as demo:
    gr.Markdown("## π US Tender Analyzer β Structured Q&A from Tenders")
    with gr.Row():
        with gr.Column(scale=1):
            file_input = gr.File(label="π Upload Tender Document (PDF/DOCX)")
            with gr.Row():
                analyze_button = gr.Button("π Analyze", variant="primary")
                terminate_button = gr.Button("β Cancel", variant="stop")
            status_box = gr.Textbox(label="π Status", value="β³ Waiting...", interactive=False)
        with gr.Column(scale=2):
            output_box = gr.Textbox(label="π§ Extracted Tender Intelligence", lines=30, interactive=False)

    # Still passed to analyze_document to satisfy its signature, but never
    # set to True any more: a State flipped by the Cancel button stayed truthy
    # for the rest of the session, so every later Analyze click returned
    # "Cancelled" immediately, and a running analysis never saw the change
    # because inputs are read once when the click fires.
    cancel_flag = gr.State(False)

    analyze_event = analyze_button.click(
        fn=analyze_document,
        inputs=[file_input, cancel_flag],
        outputs=[output_box, status_box],
    )
    # Cancel through Gradio's event cancellation instead of shared state, and
    # reflect the cancellation in the UI.
    terminate_button.click(
        fn=lambda: ("β Cancelled", "β"),
        inputs=[],
        outputs=[output_box, status_box],
        cancels=[analyze_event],
    )

# Event cancellation relies on the request queue being enabled.
demo.queue()
demo.launch(server_name="0.0.0.0", server_port=7860, share=True)