Add new SentenceTransformer model
Browse files
- .gitattributes +1 -0
- 1_Pooling/config.json +10 -0
- README.md +666 -0
- chat_template.jinja +85 -0
- config.json +63 -0
- config_sentence_transformers.json +14 -0
- model.safetensors +3 -0
- modules.json +20 -0
- sentence_bert_config.json +4 -0
- tokenizer.json +3 -0
- tokenizer_config.json +14 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
1_Pooling/config.json
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"word_embedding_dimension": 1024,
|
| 3 |
+
"pooling_mode_cls_token": false,
|
| 4 |
+
"pooling_mode_mean_tokens": false,
|
| 5 |
+
"pooling_mode_max_tokens": false,
|
| 6 |
+
"pooling_mode_mean_sqrt_len_tokens": false,
|
| 7 |
+
"pooling_mode_weightedmean_tokens": false,
|
| 8 |
+
"pooling_mode_lasttoken": true,
|
| 9 |
+
"include_prompt": true
|
| 10 |
+
}
|
README.md
ADDED
|
@@ -0,0 +1,666 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
tags:
|
| 3 |
+
- sentence-transformers
|
| 4 |
+
- sentence-similarity
|
| 5 |
+
- feature-extraction
|
| 6 |
+
- dense
|
| 7 |
+
- generated_from_trainer
|
| 8 |
+
- dataset_size:49346
|
| 9 |
+
- loss:MultipleNegativesRankingLoss
|
| 10 |
+
base_model: Qwen/Qwen3-Embedding-0.6B
|
| 11 |
+
widget:
|
| 12 |
+
- source_sentence: 'How can I prevent forwarding a manipulated email?
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
How can I prevent someone from modifying the contents of an email they received
|
| 16 |
+
and then forwarding it to others? Some employees cheat managers by changing the
|
| 17 |
+
content of emails and forwarding the modified email to them. I need a policy that prevents
|
| 18 |
+
this backdoor.'
|
| 19 |
+
sentences:
|
| 20 |
+
- 'They''re identical
|
| 21 |
+
|
| 22 |
+
They both implement the same algorithm, so it''s not like one can be faster than
|
| 23 |
+
the other. Use whichever tool is available on whichever platform you use.
|
| 24 |
+
|
| 25 |
+
In Windows one uses certUtil as
|
| 26 |
+
|
| 27 |
+
certUtil -hashfile <PATH_TO_FILE> <HASH_ALGORITHM>
|
| 28 |
+
|
| 29 |
+
and, available hash algorithms are MD2 MD4 MD5 SHA1 SHA256 SHA384 SHA512. These
|
| 30 |
+
are different hash algorithms with different output sizes and they provide different
|
| 31 |
+
security/insecurity levels. One should not use MD2, MD4, MD5, or SHA-1 unless
|
| 32 |
+
they really know what they are doing.
|
| 33 |
+
|
| 34 |
+
Be aware of encoding, even some of the online hashings are not directly compatible,
|
| 35 |
+
as we can see in StackOverflow some questions are about the interoperability
|
| 36 |
+
of the sites and libraries.
|
| 37 |
+
|
| 38 |
+
And never use online hashing for your secret/private files.'
|
| 39 |
+
- 'Direct Network Flood Detection across IaaS, Linux, Windows, and macOS
|
| 40 |
+
|
| 41 |
+
Windows
|
| 42 |
+
|
| 43 |
+
High-volume packet generation by local processes (e.g., PowerShell, cmd, curl.exe)
|
| 44 |
+
or network service processes resulting in excessive outbound traffic over short
|
| 45 |
+
time window, correlated with abnormal resource usage or degraded host responsiveness.'
|
| 46 |
+
- 'Email is unsafe -- deal with it.
|
| 47 |
+
|
| 48 |
+
Email can be made safe for an adequately defined value of "safe", through the
|
| 49 |
+
use of signatures (S/MIME or OpenPGP). This is not as easy as it seems (I mean,
|
| 50 |
+
it does not look easy, but in reality it is worse). The cornerstone of the system
|
| 51 |
+
is that unsigned emails should be rejected automatically; human users should never
|
| 52 |
+
see them at all, because if they read them, they will always believe them a little,
|
| 53 |
+
regardless of how much you may have explained to them how insecure and unsafe
|
| 54 |
+
plain emails are. Therefore, switching to signed emails is like a big jump into
|
| 55 |
+
the unknown. In practice, it is essentially a way to break emails (or to induce
|
| 56 |
+
users to switch to gmail...).
|
| 57 |
+
|
| 58 |
+
What you can do is to educate and then to educate again:
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
The smooth education: explain to your users how untrustworthy email is as a medium.
|
| 62 |
+
Show how easy it is to forge an email (e.g. with this answer). Try to prevent
|
| 63 |
+
the "wizardry effect" which makes most human beings lose common sense as soon
|
| 64 |
+
as a computer is involved (as Clarke was putting it, computers are beyond the
|
| 65 |
+
"magical horizon" of most people -- solution is to make them understand how a
|
| 66 |
+
computer works). As a bonus, this makes the users more resilient to phishing.
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
The less smooth education: let all the might of the Law fall on wannabe fraudsters.
|
| 70 |
+
Have it known that the slightest phony game with email is a shooting offense;
|
| 71 |
+
the guilty will be fired, jailed, shot and flogged (not necessarily in that order).
|
| 72 |
+
The idea is to make faking emails not worth it. This works well: this is how the
|
| 73 |
+
non-computer world deals with handwritten signatures, and it has done so for several
|
| 74 |
+
centuries.'
|
| 75 |
+
- source_sentence: "Man in the middle with certificate signed by known CA and DNS\
|
| 76 |
+
\ spoofing can work?\n\nAssuming that I have a certificate for my local webserver\
|
| 77 |
+
\ signed by a well known CA like Verisign (so it will be trusted by browsers),\
|
| 78 |
+
\ and I'm able to DNS poisoning in Man-in-the-Middle to redirect a user who wants\
|
| 79 |
+
\ to go to google.com on my local webserver who has the same hostname google.com,\
|
| 80 |
+
\ what will be the consequences? \nIs there a risk? If so, how to prevent it?\n\
|
| 81 |
+
I don't know why SSL pinning is only used for mobile app. If the attack above\
|
| 82 |
+
\ works, so SSL pinning can be a prevention. But it's not used on computer..."
|
| 83 |
+
sentences:
|
| 84 |
+
- 'Modern browsers do the smart thing: they ask the operating system. The OS interacts
|
| 85 |
+
with hardware all day long; that''s its main purpose. So it is in the right place
|
| 86 |
+
to gather randomness and mix it with a properly secure cryptographic random number
|
| 87 |
+
generator. On Windows systems, this is made available to applications through the
|
| 88 |
+
CryptGenRandom() function. Linux has the special file /dev/urandom for that. Some
|
| 89 |
+
programming languages offer their own API for that (which internally feeds on
|
| 90 |
+
the OS), e.g. java.security.SecureRandom for Java.
|
| 91 |
+
|
| 92 |
+
The hardware sources that the OS may employ primarily includes the cycle-accurate
|
| 93 |
+
time at which hardware events occur (e.g. the precise nanosecond at which you
|
| 94 |
+
press a key, or a network packet is received). The OS can thus get "enough" true
|
| 95 |
+
randomness to power a RNG, which will then yield good alea by the megabyte ("good"
|
| 96 |
+
= "indistinguishable from true randomness, up to an overwhelming work factor"
|
| 97 |
+
= "good for crypto").'
|
| 98 |
+
- "In order for that to succeed the server's hostname would have to match the certificate\
|
| 99 |
+
\ name. That means you would have to either get a CA to issue a cert as google.com\
|
| 100 |
+
\ (not likely to happen) or you would have to get a root cert from a CA you control\
|
| 101 |
+
\ and install that on the user's computer as a trusted CA certificate. \nEven\
|
| 102 |
+
\ then, many big web sites use public key pinning which forces the browser to\
|
| 103 |
+
\ only accept a specific cert until a pre-stated expiration date. Both Chromium\
|
| 104 |
+
\ and Firefox do support pinning."
|
| 105 |
+
- "How is the VPN tunnel established? My computer and the end VPN server\n need\
|
| 106 |
+
\ to exchange some info to establish a tunnel? (some kind of\n handshake or do\
|
| 107 |
+
\ I miss something?)\n\nYou need a client application on your laptop for the VPN\
|
| 108 |
+
\ that your corporation uses. The client application will perform the handshake\
|
| 109 |
+
\ for you in order to establish a secure connection to your corporation's network.\n\
|
| 110 |
+
\nHow VPN packet finds a way to the destination machine if the entire\n ip-datagram\
|
| 111 |
+
\ is encrypted? I understand that the packet is\n encapsulated within another\
|
| 112 |
+
\ packet, but the source and destination IP\n must be the same or?\n\nYour client\
|
| 113 |
+
\ application will likely install some type of virtual network adapter. This\
|
| 114 |
+
\ adapter will translate incoming and outgoing packets based on some type of session\
|
| 115 |
+
\ ID and/or traffic characteristics (for example, port numbers).\n\nIf someone\
|
| 116 |
+
\ between 2 routers sniffs the packets, they can see that the\n packet goes from\
|
| 117 |
+
\ restaurantRouter to routerCorp? right? Or they cannot\n see the full path?\n\
|
| 118 |
+
\nYes they will be able to see the IP of routerCorp.\n\nIf someone between MyComp\
|
| 119 |
+
\ and restaurantRouter sniffs the packets,\n what they can see? Can they see\
|
| 120 |
+
\ source and destination ip?\n\nYes they will see the source and destination IPs,\
|
| 121 |
+
\ but the data is encrypted.\n\nDoes the restaurantRouter or the ISP knows the\
|
| 122 |
+
\ content of the packets\n or are they already encrypted when they reach the\
|
| 123 |
+
\ restaurantRouter ?\n\nThey are encrypted on your computer before being sent\
|
| 124 |
+
\ out to restaurantRouter.\n\nDo the packets travel same path or this has nothing\
|
| 125 |
+
\ to do with the\n tunneling?\n\nTechnically there's no guarantee that the same\
|
| 126 |
+
\ hop path will be taken. \nTunneling simply means that you've encapsulated one\
|
| 127 |
+
\ protocol with another protocol. In this case, you've encapsulated your data\
|
| 128 |
+
\ using a secure protocol (VPN client), and you're using IP/TCP to transmit this\
|
| 129 |
+
\ secure data."
|
| 130 |
+
- source_sentence: "HTML iframe security exploit\n\nCan somebody explain if there's\
|
| 131 |
+
\ something malicious in this HTML, The width and height of the iframe are set\
|
| 132 |
+
\ to 1 and it's out of the view with top: -100px; statement. I believe it takes\
|
| 133 |
+
\ some sort of permission from Google account authentication without the\
|
| 134 |
+
\ consent of the user?\n<iframe name=\"oauth2relay255885454\" idstrong text=\"\
|
| 135 |
+
oauth2relay255885454\" \n src=\"https://accounts.google.com/o/oauth2/postmessageRelay?parent=http%3A%2F%2Fjankestauorg.blogspot.com#rpctoken=831371525&forcesecure=1\"\
|
| 136 |
+
\ \n tabindex=\"-1\" style=\"width: 1px; height: 1px; position: absolute;\
|
| 137 |
+
\ top: -100px;\">\n\n <!DOCTYPE html>\n <html>\n <head><title></title>\n\
|
| 138 |
+
\ <meta http-equiv=\"content-type\" content=\"text/html; charset=utf-8\"\
|
| 139 |
+
>\n <meta http-equiv=\"X-UA-Compatible\" content=\"IE=edge\"><meta\
|
| 140 |
+
\ name=\"viewport\" content=\"width=device-width, initial-scale=1, minimum-scale=1,\
|
| 141 |
+
\ maximum-scale=1, user-scalable=0\">\n <script type=\"text/javascript\"\
|
| 142 |
+
\ src=\"https://apis.google.com/js/api.js\" gapi_processed=\"true\"></script>\n\
|
| 143 |
+
\ <script type=\"text/javascript\" src=\"https://oauth.googleusercontent.com/gadgets/js/core:rpc:shindig.random:shindig.sha1.js?c=2\"\
|
| 144 |
+
></script>\n <script src=\"https://ssl.gstatic.com/accounts/o/3584451713-postmessagerelay.js\"\
|
| 145 |
+
></script>\n </head><body></body>\n </html>\n</iframe>"
|
| 146 |
+
sentences:
|
| 147 |
+
- 'The attack as you described it is not possible. This is the impossible part:
|
| 148 |
+
"that password you''ve provided me, that''ll do just fine". In the Wi-Fi protocol,
|
| 149 |
+
the client doesn''t send the password to the AP. Instead, both the client and
|
| 150 |
+
AP derive an encryption key from it, and they mutually authenticate by making
|
| 151 |
+
sure they can understand each other''s messages.'
|
| 152 |
+
- 'These are separate rules, they are checked each time a new connection arrives.
|
| 153 |
+
In your case you have two rules:
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
allowing all incoming connections on port 22 with both TCP and UDP protocols
|
| 157 |
+
|
| 158 |
+
same but it only applies to connections on the TCP protocol, which is not needed
|
| 159 |
+
in your case, because you have a rule with high priority (1 > 2) that allows traffic
|
| 160 |
+
on both protocols.
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
If you want to secure your SSH server, you can do it by allowing only specific
|
| 164 |
+
IP addresses to access port 22, use port knocking or similar methods.'
|
| 165 |
+
- 'It''s from including a Google+ button. You''re sort of right though - it''s Google
|
| 166 |
+
seeing if you have a Google+ account in order to let you share the page on Google+,
|
| 167 |
+
or potentially show a different button if you''ve already shared it.
|
| 168 |
+
|
| 169 |
+
However, only Google gets to see the data.
|
| 170 |
+
|
| 171 |
+
The entire frame is generated by Google, including the positioning. It is placed
|
| 172 |
+
off-screen in order to avoid messing up layout of the page.'
|
| 173 |
+
- source_sentence: 'Are 6 digit number verifications secure?
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
I''ve noticed that many services and/or platforms now send 6 digit codes to verify
|
| 177 |
+
user actions. Typically sent to an email. With these being 6 digits long & assuming
|
| 178 |
+
that you get at least 3 tries to enter there''s a 1 in 333,333 chance that a malicious
|
| 179 |
+
user could correctly guess the digits.
|
| 180 |
+
|
| 181 |
+
Seems low to me. Is this enough security?'
|
| 182 |
+
sentences:
|
| 183 |
+
- 'Verification digits secure against a compromised account being used to perform
|
| 184 |
+
certain actions.
|
| 185 |
+
|
| 186 |
+
So, if a malicious person was able to compromise the account, they would also
|
| 187 |
+
need to compromise the channel used to transmit the verification code.
|
| 188 |
+
|
| 189 |
+
Or guess the code.
|
| 190 |
+
|
| 191 |
+
Guessing the code is so unlikely that it provides sufficient verification. If
|
| 192 |
+
there are attempts to guess, then other analysis starts to kick in to determine
|
| 193 |
+
if the account is suspicious. So, it''s not just that the code might be accepted,
|
| 194 |
+
but all the metadata around the code entry can be used to prove that someone guessed
|
| 195 |
+
the code.
|
| 196 |
+
|
| 197 |
+
So, yes, until this method is shown to be insecure, it is secure enough. Even
|
| 198 |
+
a 4-digit code could be deemed "secure enough", depending on the complementary
|
| 199 |
+
controls.'
|
| 200 |
+
- 'C0017
|
| 201 |
+
|
| 202 |
+
C0017 was an APT41 campaign conducted between May 2021 and February 2022 that
|
| 203 |
+
successfully compromised at least six U.S. state government networks through the
|
| 204 |
+
exploitation of vulnerable Internet facing web applications. During C0017, APT41
|
| 205 |
+
was quick to adapt and use publicly-disclosed as well as zero-day vulnerabilities
|
| 206 |
+
for initial access, and in at least two cases re-compromised victims following
|
| 207 |
+
remediation efforts. The goals of C0017 are unknown, however APT41 was observed
|
| 208 |
+
exfiltrating Personal Identifiable Information (PII).[1]
|
| 209 |
+
|
| 210 |
+
During C0017, APT41 downloaded malicious payloads onto compromised systems.[107]'
|
| 211 |
+
- 'If the site with the instructions is compromised the attacker can replace all
|
| 212 |
+
instructions. This means that he can specify a different key to use for verification
|
| 213 |
+
or omit the verification information completely, that he can specify a different
|
| 214 |
+
URL for download etc. That such a compromise can happen shows the hack of the
|
| 215 |
+
Linux Mint site in 2016 which was changed to point to compromised download at
|
| 216 |
+
this time.
|
| 217 |
+
|
| 218 |
+
A site compromise can actually quickly be detected if a site is monitored for
|
| 219 |
+
changes, which is hopefully the case for important sites. But an attacker does
|
| 220 |
+
not even need to compromise the original site. Instead the attacker could buy
|
| 221 |
+
ads for search engines which direct you to "alternative" download sites if you
|
| 222 |
+
are looking for the downloads. These download sites are in full control of the
|
| 223 |
+
attacker so he can publish anything there. Such alternative sites are also not
|
| 224 |
+
detected when monitoring the original site. This type of social attack is very
|
| 225 |
+
common with downloads of openoffice or similar where a simple download openoffice
|
| 226 |
+
might lead to a result page which is full of advertisements pointing to download
|
| 227 |
+
sites. These then usually provide "enhanced" installs which add additional software
|
| 228 |
+
but could also provide a compromised version.
|
| 229 |
+
|
| 230 |
+
There is not a lot one can do about this. The underlying question to all of this
|
| 231 |
+
is whom you can trust. Even if your new computer comes pre-installed with Windows
|
| 232 |
+
you have to still trust the vendor that the version you have is the original one
|
| 233 |
+
and that no harmful software was installed additionally. History shows that this
|
| 234 |
+
is not always the case and that even seemingly trustable vendors add harmful software.'
|
| 235 |
+
- source_sentence: 'Scheduled Task
|
| 236 |
+
|
| 237 |
+
Adversaries may abuse the Windows Task Scheduler to perform task scheduling for
|
| 238 |
+
initial or recurring execution of malicious code. There are multiple ways to access
|
| 239 |
+
the Task Scheduler in Windows. The schtasks utility can be run directly on the
|
| 240 |
+
command line, or the Task Scheduler can be opened through the GUI within the Administrator
|
| 241 |
+
Tools section of the Control Panel.[1] In some cases, adversaries have used a
|
| 242 |
+
.NET wrapper for the Windows Task Scheduler, and alternatively, adversaries have
|
| 243 |
+
used the Windows netapi32 library and Windows Management Instrumentation (WMI)
|
| 244 |
+
to create a scheduled task. Adversaries may also utilize the Powershell Cmdlet
|
| 245 |
+
Invoke-CimMethod, which leverages WMI class PS_ScheduledTask to create a scheduled
|
| 246 |
+
task via an XML path.[2] An adversary may use Windows Task Scheduler to execute
|
| 247 |
+
programs at system startup or on a scheduled basis for persistence. The Windows
|
| 248 |
+
Task Scheduler can also be abused to conduct remote Execution as part of Lateral
|
| 249 |
+
Movement and/or to run a process under the context of a specified account (such
|
| 250 |
+
as SYSTEM). Similar to System Binary Proxy Execution, adversaries have also abused
|
| 251 |
+
the Windows Task Scheduler to potentially mask one-time execution under signed/trusted
|
| 252 |
+
system processes.[3] Adversaries may also create "hidden" scheduled tasks (i.e.
|
| 253 |
+
Hide Artifacts) that may not be visible to defender tools and manual queries used
|
| 254 |
+
to enumerate tasks. Specifically, an adversary may hide a task from schtasks /query
|
| 255 |
+
and the Task Scheduler by deleting the associated Security Descriptor (SD) registry
|
| 256 |
+
value (where deletion of this value must be completed using SYSTEM permissions).[4][5]
|
| 257 |
+
Adversaries may also employ alternate methods to hide tasks, such as altering
|
| 258 |
+
the metadata (e.g., Index value) within associated registry keys.[6]'
|
| 259 |
+
sentences:
|
| 260 |
+
- 'RFC 4880, the standard for the format of PGP messages, says:
|
| 261 |
+
|
| 262 |
+
|
| 263 |
+
The high 16 bits (first two octets) of the hash are included in the
|
| 264 |
+
|
| 265 |
+
|
| 266 |
+
Signature packet to provide a quick test to reject some invalid
|
| 267 |
+
|
| 268 |
+
signatures.
|
| 269 |
+
|
| 270 |
+
|
| 271 |
+
However, you are thinking it wrong. Signatures are not encryption, and signatures
|
| 272 |
+
are not encrypted. In fact, given the value of the public key (which is public)
|
| 273 |
+
and the signature itself, one can recompute the complete hash value of the message
|
| 274 |
+
which is signed (at least for RSA, which is technically known as a signature algorithm
|
| 275 |
+
with recovery). The first 16 bits are just a helper so that software can avoid
|
| 276 |
+
many modular exponentiations when it is looking for the "correct" public key among
|
| 277 |
+
a set of candidates; they save a few milliseconds worth of computation, that''s
|
| 278 |
+
all.
|
| 279 |
+
|
| 280 |
+
|
| 281 |
+
A generic note is that signatures can leak information on that which is signed;
|
| 282 |
+
so if you sign and encrypt a confidential message, then you should, conceptually,
|
| 283 |
+
either sign the encrypted message, or encrypt the signature along with the message
|
| 284 |
+
contents as well. OpenPGP uses the latter method.
|
| 285 |
+
|
| 286 |
+
When a message is just signed, not encrypted, then it makes no sense to hide the
|
| 287 |
+
message hash, since the message itself, by definition, is transmitted as cleartext.'
|
| 288 |
+
- 'BlackByte
|
| 289 |
+
|
| 290 |
+
BlackByte is a ransomware threat actor operating since at least 2021. BlackByte
|
| 291 |
+
is associated with several versions of ransomware also labeled BlackByte Ransomware.
|
| 292 |
+
BlackByte ransomware operations initially used a common encryption key allowing
|
| 293 |
+
for the development of a universal decryptor, but subsequent versions such as
|
| 294 |
+
BlackByte 2.0 Ransomware use more robust encryption mechanisms. BlackByte is notable
|
| 295 |
+
for operations targeting critical infrastructure entities among other targets
|
| 296 |
+
across North America.[1][2][3][4][5]
|
| 297 |
+
|
| 298 |
+
BlackByte created scheduled tasks for payload execution.[38][39]'
|
| 299 |
+
- 'APT32
|
| 300 |
+
|
| 301 |
+
APT32 is a suspected Vietnam-based threat group that has been active since at
|
| 302 |
+
least 2014. The group has targeted multiple private sector industries as well
|
| 303 |
+
as foreign governments, dissidents, and journalists with a strong focus on Southeast
|
| 304 |
+
Asian countries like Vietnam, the Philippines, Laos, and Cambodia. They have extensively
|
| 305 |
+
used strategic web compromises to compromise victims.[1][2][3]
|
| 306 |
+
|
| 307 |
+
APT32 used the net view command to show all shares available, including the administrative
|
| 308 |
+
shares such as C$ and ADMIN$.[5]'
|
| 309 |
+
pipeline_tag: sentence-similarity
|
| 310 |
+
library_name: sentence-transformers
|
| 311 |
+
---
|
| 312 |
+
|
| 313 |
+
# SentenceTransformer based on Qwen/Qwen3-Embedding-0.6B
|
| 314 |
+
|
| 315 |
+
This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [Qwen/Qwen3-Embedding-0.6B](https://huggingface.co/Qwen/Qwen3-Embedding-0.6B) on the json dataset. It maps sentences & paragraphs to a 1024-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.
|
| 316 |
+
|
| 317 |
+
## Model Details
|
| 318 |
+
|
| 319 |
+
### Model Description
|
| 320 |
+
- **Model Type:** Sentence Transformer
|
| 321 |
+
- **Base model:** [Qwen/Qwen3-Embedding-0.6B](https://huggingface.co/Qwen/Qwen3-Embedding-0.6B) <!-- at revision c54f2e6e80b2d7b7de06f51cec4959f6b3e03418 -->
|
| 322 |
+
- **Maximum Sequence Length:** 2048 tokens
|
| 323 |
+
- **Output Dimensionality:** 1024 dimensions
|
| 324 |
+
- **Similarity Function:** Cosine Similarity
|
| 325 |
+
- **Training Dataset:**
|
| 326 |
+
- json
|
| 327 |
+
<!-- - **Language:** Unknown -->
|
| 328 |
+
<!-- - **License:** Unknown -->
|
| 329 |
+
|
| 330 |
+
### Model Sources
|
| 331 |
+
|
| 332 |
+
- **Documentation:** [Sentence Transformers Documentation](https://sbert.net)
|
| 333 |
+
- **Repository:** [Sentence Transformers on GitHub](https://github.com/huggingface/sentence-transformers)
|
| 334 |
+
- **Hugging Face:** [Sentence Transformers on Hugging Face](https://huggingface.co/models?library=sentence-transformers)
|
| 335 |
+
|
| 336 |
+
### Full Model Architecture
|
| 337 |
+
|
| 338 |
+
```
|
| 339 |
+
SentenceTransformer(
|
| 340 |
+
(0): Transformer({'max_seq_length': 2048, 'do_lower_case': False, 'architecture': 'Qwen3Model'})
|
| 341 |
+
(1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': True, 'include_prompt': True})
|
| 342 |
+
(2): Normalize()
|
| 343 |
+
)
|
| 344 |
+
```
|
| 345 |
+
|
| 346 |
+
## Usage
|
| 347 |
+
|
| 348 |
+
### Direct Usage (Sentence Transformers)
|
| 349 |
+
|
| 350 |
+
First install the Sentence Transformers library:
|
| 351 |
+
|
| 352 |
+
```bash
|
| 353 |
+
pip install -U sentence-transformers
|
| 354 |
+
```
|
| 355 |
+
|
| 356 |
+
Then you can load this model and run inference.
|
| 357 |
+
```python
|
| 358 |
+
from sentence_transformers import SentenceTransformer
|
| 359 |
+
|
| 360 |
+
# Download from the 🤗 Hub
|
| 361 |
+
model = SentenceTransformer("ThienLe/Qwen3-SecEmbed")
|
| 362 |
+
# Run inference
|
| 363 |
+
queries = [
|
| 364 |
+
"Scheduled Task\nAdversaries may abuse the Windows Task Scheduler to perform task scheduling for initial or recurring execution of malicious code. There are multiple ways to access the Task Scheduler in Windows. The schtasks utility can be run directly on the command line, or the Task Scheduler can be opened through the GUI within the Administrator Tools section of the Control Panel.[1] In some cases, adversaries have used a .NET wrapper for the Windows Task Scheduler, and alternatively, adversaries have used the Windows netapi32 library and Windows Management Instrumentation (WMI) to create a scheduled task. Adversaries may also utilize the Powershell Cmdlet Invoke-CimMethod, which leverages WMI class PS_ScheduledTask to create a scheduled task via an XML path.[2] An adversary may use Windows Task Scheduler to execute programs at system startup or on a scheduled basis for persistence. The Windows Task Scheduler can also be abused to conduct remote Execution as part of Lateral Movement and/or to run a process under the context of a specified account (such as SYSTEM). Similar to System Binary Proxy Execution, adversaries have also abused the Windows Task Scheduler to potentially mask one-time execution under signed/trusted system processes.[3] Adversaries may also create \"hidden\" scheduled tasks (i.e. Hide Artifacts) that may not be visible to defender tools and manual queries used to enumerate tasks. Specifically, an adversary may hide a task from schtasks /query and the Task Scheduler by deleting the associated Security Descriptor (SD) registry value (where deletion of this value must be completed using SYSTEM permissions).[4][5] Adversaries may also employ alternate methods to hide tasks, such as altering the metadata (e.g., Index value) within associated registry keys.[6]",
|
| 365 |
+
]
|
| 366 |
+
documents = [
|
| 367 |
+
'BlackByte\nBlackByte is a ransomware threat actor operating since at least 2021. BlackByte is associated with several versions of ransomware also labeled BlackByte Ransomware. BlackByte ransomware operations initially used a common encryption key allowing for the development of a universal decryptor, but subsequent versions such as BlackByte 2.0 Ransomware use more robust encryption mechanisms. BlackByte is notable for operations targeting critical infrastructure entities among other targets across North America.[1][2][3][4][5]\nBlackByte created scheduled tasks for payload execution.[38][39]',
|
| 368 |
+
'APT32\nAPT32 is a suspected Vietnam-based threat group that has been active since at least 2014. The group has targeted multiple private sector industries as well as foreign governments, dissidents, and journalists with a strong focus on Southeast Asian countries like Vietnam, the Philippines, Laos, and Cambodia. They have extensively used strategic web compromises to compromise victims.[1][2][3]\nAPT32 used the net view command to show all shares available, including the administrative shares such as C$ and ADMIN$.[5]',
|
| 369 |
+
'RFC 4880, the standard for the format of PGP messages, says:\n\nThe high 16 bits (first two octets) of the hash are included in the\n\nSignature packet to provide a quick test to reject some invalid\nsignatures.\n\nHowever, you are thinking it wrong. Signatures are not encryption, and signatures are not encrypted. In fact, given the value of the public key (which is public) and the signature itself, one can recompute the complete hash value of the message which is signed (at least for RSA, which is technically known as a signature algorithm with recovery). The first 16 bits are just a helper so that software can avoid many modular exponentiations when it is looking for the "correct" public key among a set of candidates; they save a few milliseconds worth of computation, that\'s all.\n\nA generic note is that signatures can leak information on that which is signed; so if you sign and encrypt a confidential message, then you should, conceptually, either sign the encrypted message, or encrypt the signature along with the message contents as well. OpenPGP uses the latter method.\nWhen a message is just signed, not encrypted, then it makes no sense to hide the message hash, since the message itself, by definition, is transmitted as cleartext.',
|
| 370 |
+
]
|
| 371 |
+
query_embeddings = model.encode_query(queries)
|
| 372 |
+
document_embeddings = model.encode_document(documents)
|
| 373 |
+
print(query_embeddings.shape, document_embeddings.shape)
|
| 374 |
+
# (1, 1024) (3, 1024)
|
| 375 |
+
|
| 376 |
+
# Get the similarity scores for the embeddings
|
| 377 |
+
similarities = model.similarity(query_embeddings, document_embeddings)
|
| 378 |
+
print(similarities)
|
| 379 |
+
# tensor([[ 0.8511, 0.1452, -0.0968]])
|
| 380 |
+
```
|
| 381 |
+
|
| 382 |
+
<!--
|
| 383 |
+
### Direct Usage (Transformers)
|
| 384 |
+
|
| 385 |
+
<details><summary>Click to see the direct usage in Transformers</summary>
|
| 386 |
+
|
| 387 |
+
</details>
|
| 388 |
+
-->
|
| 389 |
+
|
| 390 |
+
<!--
|
| 391 |
+
### Downstream Usage (Sentence Transformers)
|
| 392 |
+
|
| 393 |
+
You can finetune this model on your own dataset.
|
| 394 |
+
|
| 395 |
+
<details><summary>Click to expand</summary>
|
| 396 |
+
|
| 397 |
+
</details>
|
| 398 |
+
-->
|
| 399 |
+
|
| 400 |
+
<!--
|
| 401 |
+
### Out-of-Scope Use
|
| 402 |
+
|
| 403 |
+
*List how the model may foreseeably be misused and address what users ought not to do with the model.*
|
| 404 |
+
-->
|
| 405 |
+
|
| 406 |
+
<!--
|
| 407 |
+
## Bias, Risks and Limitations
|
| 408 |
+
|
| 409 |
+
*What are the known or foreseeable issues stemming from this model? You could also flag here known failure cases or weaknesses of the model.*
|
| 410 |
+
-->
|
| 411 |
+
|
| 412 |
+
<!--
|
| 413 |
+
### Recommendations
|
| 414 |
+
|
| 415 |
+
*What are recommendations with respect to the foreseeable issues? For example, filtering explicit content.*
|
| 416 |
+
-->
|
| 417 |
+
|
| 418 |
+
## Training Details
|
| 419 |
+
|
| 420 |
+
### Training Dataset
|
| 421 |
+
|
| 422 |
+
#### json
|
| 423 |
+
|
| 424 |
+
* Dataset: json
|
| 425 |
+
* Size: 49,346 training samples
|
| 426 |
+
* Columns: <code>sentence1</code> and <code>sentence2</code>
|
| 427 |
+
* Approximate statistics based on the first 1000 samples:
|
| 428 |
+
| | sentence1 | sentence2 |
|
| 429 |
+
|:--------|:--------------------------------------------------------------------------------------|:--------------------------------------------------------------------------------------|
|
| 430 |
+
| type | string | string |
|
| 431 |
+
| details | <ul><li>min: 27 tokens</li><li>mean: 220.82 tokens</li><li>max: 2048 tokens</li></ul> | <ul><li>min: 13 tokens</li><li>mean: 220.84 tokens</li><li>max: 1945 tokens</li></ul> |
|
| 432 |
+
* Samples:
|
| 433 |
+
| sentence1 | sentence2 |
|
| 434 |
+
|:------------------------------------------------------------------------------------------|:------------------------------------------------------------------------------------------|
|
| 435 |
+
| <code>Is it possible to send data from an open-source program but make it impossible for a user with source code to do the same?<br><br>If I want to store a global scoreboard for a game running locally on the user's computer and I want to make sure that all the requests coming to the server are really generated by the game and not spoofed by the users, is there anything that can be done to prevent cheating? The program is open-source, so no obfuscation can be used.<br>The problem I have with coming up with a solution is that whatever I choose to implement in code must necessarily include all the components necessary for creating a spoofed program that can send any data user wants, especially any encryption keys and hashing algorithms used.</code> | <code>In a word, no. The best option is to move all the actual game logic to the server, and have the client be a thin client that just displays state and sends input. That wouldn't prevent various types of cheating (such as game automation tools) but it's the only way to comprehensively avoid the user sending fraudulent game results. It's a popular approach in multi-player, too, as it avoids telling the user anything they don't need to know. However, it's considerably more expensive in server hardware.<br>The next option is to make it possible to validate high scores. One option would be to record, for every game, the RNG seed value followed by every input that the user provides combined with a timestamp of some sort (for turn-based games this is just an action count; for real-time games it would probably be a game engine frame/tick count). Transmitting all of that back to the server would be a lot, but - combined with the game version and level or whatever - allow the server to effectively re...</code> |
|
| 436 |
+
| <code>Encrypted/Encoded File<br>Adversaries may encrypt or encode files to obfuscate strings, bytes, and other specific patterns to impede detection. Encrypting and/or encoding file content aims to conceal malicious artifacts within a file used in an intrusion. Many other techniques, such as Software Packing, Steganography, and Embedded Payloads, share this same broad objective. Encrypting and/or encoding files could lead to a lapse in detection of static signatures, only for this malicious content to be revealed (i.e., Deobfuscate/Decode Files or Information) at the time of execution/use. This type of file obfuscation can be applied to many file artifacts present on victim hosts, such as malware log/configuration and payload files.[1] Files can be encrypted with a hardcoded or user-supplied key, as well as otherwise obfuscated using standard encoding schemes such as Base64. The entire content of a file may be obfuscated, or just specific functions or values (such as C2 addresses). Encryption a...</code> | <code>Dark Caracal<br>Dark Caracal is threat group that has been attributed to the Lebanese General Directorate of General Security (GDGS) and has operated since at least 2012. [1]<br>Dark Caracal has obfuscated strings in Bandook by base64 encoding, and then encrypting them.[58]</code> |
|
| 437 |
+
| <code>OS Credential Dumping<br>Adversaries may attempt to dump credentials to obtain account login and credential material, normally in the form of a hash or a clear text password. Credentials can be obtained from OS caches, memory, or structures.[1] Credentials can then be used to perform Lateral Movement and access restricted information. Several of the tools mentioned in associated sub-techniques may be used by both adversaries and professional security testers. Additional custom tools likely exist as well.</code> | <code>Ember Bear<br>Ember Bear is a Russian state-sponsored cyber espionage group that has been active since at least 2020, linked to Russia's General Staff Main Intelligence Directorate (GRU) 161st Specialist Training Center (Unit 29155).[1] Ember Bear has primarily focused operations against Ukrainian government and telecommunication entities, but has also operated against critical infrastructure entities in Europe and the Americas.[2] Ember Bear conducted the WhisperGate destructive wiper attacks against Ukraine in early 2022.[3][4][1] There is some confusion as to whether Ember Bear overlaps with another Russian-linked entity referred to as Saint Bear. At present available evidence strongly suggests these are distinct activities with different behavioral profiles.[2][5]<br>Ember Bear gathers credential material from target systems, such as SSH keys, to facilitate access to victim environments.[12]</code> |
|
| 438 |
+
* Loss: [<code>MultipleNegativesRankingLoss</code>](https://sbert.net/docs/package_reference/sentence_transformer/losses.html#multiplenegativesrankingloss) with these parameters:
|
| 439 |
+
```json
|
| 440 |
+
{
|
| 441 |
+
"scale": 20.0,
|
| 442 |
+
"similarity_fct": "cos_sim",
|
| 443 |
+
"gather_across_devices": false
|
| 444 |
+
}
|
| 445 |
+
```
|
| 446 |
+
|
| 447 |
+
### Evaluation Dataset
|
| 448 |
+
|
| 449 |
+
#### json
|
| 450 |
+
|
| 451 |
+
* Dataset: json
|
| 452 |
+
* Size: 12,337 evaluation samples
|
| 453 |
+
* Columns: <code>sentence1</code> and <code>sentence2</code>
|
| 454 |
+
* Approximate statistics based on the first 1000 samples:
|
| 455 |
+
| | sentence1 | sentence2 |
|
| 456 |
+
|:--------|:--------------------------------------------------------------------------------------|:--------------------------------------------------------------------------------------|
|
| 457 |
+
| type | string | string |
|
| 458 |
+
| details | <ul><li>min: 13 tokens</li><li>mean: 222.63 tokens</li><li>max: 2048 tokens</li></ul> | <ul><li>min: 13 tokens</li><li>mean: 226.13 tokens</li><li>max: 2048 tokens</li></ul> |
|
| 459 |
+
* Samples:
|
| 460 |
+
| sentence1 | sentence2 |
|
| 461 |
+
|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
| 462 |
+
| <code>Security Account Manager<br>Adversaries may attempt to extract credential material from the Security Account Manager (SAM) database either through in-memory techniques or through the Windows Registry where the SAM database is stored. The SAM is a database file that contains local accounts for the host, typically those found with the net user command. Enumerating the SAM database requires SYSTEM level access. A number of tools can be used to retrieve the SAM file through in-memory techniques: Alternatively, the SAM can be extracted from the Registry with Reg: Creddump7 can then be used to process the SAM database locally to retrieve hashes.[1] Notes:</code> | <code>APT29<br>APT29 is threat group that has been attributed to Russia's Foreign Intelligence Service (SVR).[1][2] They have operated since at least 2008, often targeting government networks in Europe and NATO member countries, research institutes, and think tanks. APT29 reportedly compromised the Democratic National Committee starting in the summer of 2015.[3][4][5][6] In April 2021, the US and UK governments attributed the SolarWinds Compromise to the SVR; public statements included citations to APT29, Cozy Bear, and The Dukes.[7][8] Industry reporting also referred to the actors involved in this campaign as UNC2452, NOBELIUM, StellarParticle, Dark Halo, and SolarStorm.[9][10][11][12][13][14]<br>APT29 has used the reg save command to save registry hives.[4]</code> |
|
| 463 |
+
| <code>Why don't we use MAC address instead of IP address?<br><br>I can use the system function in PHP to get the MAC address of site visitors (probably most of you know). Why do we use IP addresss to check whether someone is stealing a cookie or not?<br>Does the system function have more overhead, or is it still insecure when we don't send any parameter to the function?<br>I know there are some situations in which users change their MAC address, but it happens less than IP address.<br>Could you shed some light on it?</code> | <code>The reason for that is very simple: You won't get the MAC address of your website visitor over the Internet, because they are lost when the packets are routed. You can only get the MAC addresses from your subnet (through, for example, ARP).</code> |
|
| 464 |
+
| <code>Native API<br>Adversaries may interact with the native OS application programming interface (API) to execute behaviors. Native APIs provide a controlled means of calling low-level OS services within the kernel, such as those involving hardware/devices, memory, and processes.[1][2] These native APIs are leveraged by the OS during system boot (when other system components are not yet initialized) as well as carrying out tasks and requests during routine operations. Adversaries may abuse these OS API functions as a means of executing behaviors. Similar to Command and Scripting Interpreter, the native API and its hierarchy of interfaces provide mechanisms to interact with and utilize various components of a victimized system. Native API functions (such as NtCreateProcess) may be directed invoked via system calls / syscalls, but these features are also often exposed to user-mode applications via interfaces and libraries.[3][4][5] For example, functions such as the Windows API CreateProcess() o...</code> | <code>Kapeka<br>Kapeka is a backdoor written in C++ used against victims in Eastern Europe since at least mid-2022. Kapeka has technical overlaps with Exaramel for Windows and Prestige malware variants, both of which are linked to Sandworm Team. Kapeka may have been used in advance of Prestige deployment in late 2022.[1][2]<br>Kapeka utilizes WinAPI calls to gather victim system information.[124]</code> |
|
| 465 |
+
* Loss: [<code>MultipleNegativesRankingLoss</code>](https://sbert.net/docs/package_reference/sentence_transformer/losses.html#multiplenegativesrankingloss) with these parameters:
|
| 466 |
+
```json
|
| 467 |
+
{
|
| 468 |
+
"scale": 20.0,
|
| 469 |
+
"similarity_fct": "cos_sim",
|
| 470 |
+
"gather_across_devices": false
|
| 471 |
+
}
|
| 472 |
+
```
|
| 473 |
+
|
| 474 |
+
### Training Hyperparameters
|
| 475 |
+
#### Non-Default Hyperparameters
|
| 476 |
+
|
| 477 |
+
- `per_device_train_batch_size`: 16
|
| 478 |
+
- `num_train_epochs`: 1
|
| 479 |
+
- `warmup_steps`: 300
|
| 480 |
+
- `optim`: adamw_8bit
|
| 481 |
+
- `gradient_accumulation_steps`: 2
|
| 482 |
+
- `bf16`: True
|
| 483 |
+
- `gradient_checkpointing`: True
|
| 484 |
+
- `eval_strategy`: steps
|
| 485 |
+
- `per_device_eval_batch_size`: 16
|
| 486 |
+
- `dataloader_num_workers`: 4
|
| 487 |
+
- `dataloader_pin_memory`: False
|
| 488 |
+
|
| 489 |
+
#### All Hyperparameters
|
| 490 |
+
<details><summary>Click to expand</summary>
|
| 491 |
+
|
| 492 |
+
- `per_device_train_batch_size`: 16
|
| 493 |
+
- `num_train_epochs`: 1
|
| 494 |
+
- `max_steps`: -1
|
| 495 |
+
- `learning_rate`: 5e-05
|
| 496 |
+
- `lr_scheduler_type`: linear
|
| 497 |
+
- `lr_scheduler_kwargs`: None
|
| 498 |
+
- `warmup_steps`: 300
|
| 499 |
+
- `optim`: adamw_8bit
|
| 500 |
+
- `optim_args`: None
|
| 501 |
+
- `weight_decay`: 0.0
|
| 502 |
+
- `adam_beta1`: 0.9
|
| 503 |
+
- `adam_beta2`: 0.999
|
| 504 |
+
- `adam_epsilon`: 1e-08
|
| 505 |
+
- `optim_target_modules`: None
|
| 506 |
+
- `gradient_accumulation_steps`: 2
|
| 507 |
+
- `average_tokens_across_devices`: True
|
| 508 |
+
- `max_grad_norm`: 1.0
|
| 509 |
+
- `label_smoothing_factor`: 0.0
|
| 510 |
+
- `bf16`: True
|
| 511 |
+
- `fp16`: False
|
| 512 |
+
- `bf16_full_eval`: False
|
| 513 |
+
- `fp16_full_eval`: False
|
| 514 |
+
- `tf32`: None
|
| 515 |
+
- `gradient_checkpointing`: True
|
| 516 |
+
- `gradient_checkpointing_kwargs`: None
|
| 517 |
+
- `torch_compile`: False
|
| 518 |
+
- `torch_compile_backend`: None
|
| 519 |
+
- `torch_compile_mode`: None
|
| 520 |
+
- `use_liger_kernel`: False
|
| 521 |
+
- `liger_kernel_config`: None
|
| 522 |
+
- `use_cache`: False
|
| 523 |
+
- `neftune_noise_alpha`: None
|
| 524 |
+
- `torch_empty_cache_steps`: None
|
| 525 |
+
- `auto_find_batch_size`: False
|
| 526 |
+
- `log_on_each_node`: True
|
| 527 |
+
- `logging_nan_inf_filter`: True
|
| 528 |
+
- `include_num_input_tokens_seen`: no
|
| 529 |
+
- `log_level`: passive
|
| 530 |
+
- `log_level_replica`: warning
|
| 531 |
+
- `disable_tqdm`: False
|
| 532 |
+
- `project`: huggingface
|
| 533 |
+
- `trackio_space_id`: trackio
|
| 534 |
+
- `eval_strategy`: steps
|
| 535 |
+
- `per_device_eval_batch_size`: 16
|
| 536 |
+
- `prediction_loss_only`: True
|
| 537 |
+
- `eval_on_start`: False
|
| 538 |
+
- `eval_do_concat_batches`: True
|
| 539 |
+
- `eval_use_gather_object`: False
|
| 540 |
+
- `eval_accumulation_steps`: None
|
| 541 |
+
- `include_for_metrics`: []
|
| 542 |
+
- `batch_eval_metrics`: False
|
| 543 |
+
- `save_only_model`: False
|
| 544 |
+
- `save_on_each_node`: False
|
| 545 |
+
- `enable_jit_checkpoint`: False
|
| 546 |
+
- `push_to_hub`: False
|
| 547 |
+
- `hub_private_repo`: None
|
| 548 |
+
- `hub_model_id`: None
|
| 549 |
+
- `hub_strategy`: every_save
|
| 550 |
+
- `hub_always_push`: False
|
| 551 |
+
- `hub_revision`: None
|
| 552 |
+
- `load_best_model_at_end`: False
|
| 553 |
+
- `ignore_data_skip`: False
|
| 554 |
+
- `restore_callback_states_from_checkpoint`: False
|
| 555 |
+
- `full_determinism`: False
|
| 556 |
+
- `seed`: 42
|
| 557 |
+
- `data_seed`: None
|
| 558 |
+
- `use_cpu`: False
|
| 559 |
+
- `accelerator_config`: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
|
| 560 |
+
- `parallelism_config`: None
|
| 561 |
+
- `dataloader_drop_last`: False
|
| 562 |
+
- `dataloader_num_workers`: 4
|
| 563 |
+
- `dataloader_pin_memory`: False
|
| 564 |
+
- `dataloader_persistent_workers`: False
|
| 565 |
+
- `dataloader_prefetch_factor`: None
|
| 566 |
+
- `remove_unused_columns`: True
|
| 567 |
+
- `label_names`: None
|
| 568 |
+
- `train_sampling_strategy`: random
|
| 569 |
+
- `length_column_name`: length
|
| 570 |
+
- `ddp_find_unused_parameters`: None
|
| 571 |
+
- `ddp_bucket_cap_mb`: None
|
| 572 |
+
- `ddp_broadcast_buffers`: False
|
| 573 |
+
- `ddp_backend`: None
|
| 574 |
+
- `ddp_timeout`: 1800
|
| 575 |
+
- `fsdp`: []
|
| 576 |
+
- `fsdp_config`: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
|
| 577 |
+
- `deepspeed`: None
|
| 578 |
+
- `debug`: []
|
| 579 |
+
- `skip_memory_metrics`: True
|
| 580 |
+
- `do_predict`: False
|
| 581 |
+
- `resume_from_checkpoint`: None
|
| 582 |
+
- `warmup_ratio`: None
|
| 583 |
+
- `local_rank`: -1
|
| 584 |
+
- `prompts`: None
|
| 585 |
+
- `batch_sampler`: batch_sampler
|
| 586 |
+
- `multi_dataset_batch_sampler`: proportional
|
| 587 |
+
- `router_mapping`: {}
|
| 588 |
+
- `learning_rate_mapping`: {}
|
| 589 |
+
|
| 590 |
+
</details>
|
| 591 |
+
|
| 592 |
+
### Training Logs
|
| 593 |
+
| Epoch | Step | Training Loss | Validation Loss |
|
| 594 |
+
|:------:|:----:|:-------------:|:---------------:|
|
| 595 |
+
| 0.0648 | 100 | 0.2915 | - |
|
| 596 |
+
| 0.1297 | 200 | 0.0912 | 0.0981 |
|
| 597 |
+
| 0.1945 | 300 | 0.1002 | - |
|
| 598 |
+
| 0.2593 | 400 | 0.1025 | 0.0940 |
|
| 599 |
+
| 0.3241 | 500 | 0.0851 | - |
|
| 600 |
+
| 0.3890 | 600 | 0.0707 | 0.0785 |
|
| 601 |
+
| 0.4538 | 700 | 0.0498 | - |
|
| 602 |
+
| 0.5186 | 800 | 0.0683 | 0.0609 |
|
| 603 |
+
| 0.5835 | 900 | 0.0536 | - |
|
| 604 |
+
| 0.6483 | 1000 | 0.0484 | 0.0562 |
|
| 605 |
+
| 0.7131 | 1100 | 0.0406 | - |
|
| 606 |
+
| 0.7780 | 1200 | 0.0468 | 0.0503 |
|
| 607 |
+
| 0.8428 | 1300 | 0.0392 | - |
|
| 608 |
+
| 0.9076 | 1400 | 0.0386 | 0.0486 |
|
| 609 |
+
| 0.9724 | 1500 | 0.0406 | - |
|
| 610 |
+
|
| 611 |
+
|
| 612 |
+
### Framework Versions
|
| 613 |
+
- Python: 3.12.12
|
| 614 |
+
- Sentence Transformers: 5.2.3
|
| 615 |
+
- Transformers: 5.2.0
|
| 616 |
+
- PyTorch: 2.10.0+cu128
|
| 617 |
+
- Accelerate: 1.12.0
|
| 618 |
+
- Datasets: 4.5.0
|
| 619 |
+
- Tokenizers: 0.22.2
|
| 620 |
+
|
| 621 |
+
## Citation
|
| 622 |
+
|
| 623 |
+
### BibTeX
|
| 624 |
+
|
| 625 |
+
#### Sentence Transformers
|
| 626 |
+
```bibtex
|
| 627 |
+
@inproceedings{reimers-2019-sentence-bert,
|
| 628 |
+
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
|
| 629 |
+
author = "Reimers, Nils and Gurevych, Iryna",
|
| 630 |
+
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
|
| 631 |
+
month = "11",
|
| 632 |
+
year = "2019",
|
| 633 |
+
publisher = "Association for Computational Linguistics",
|
| 634 |
+
url = "https://arxiv.org/abs/1908.10084",
|
| 635 |
+
}
|
| 636 |
+
```
|
| 637 |
+
|
| 638 |
+
#### MultipleNegativesRankingLoss
|
| 639 |
+
```bibtex
|
| 640 |
+
@misc{henderson2017efficient,
|
| 641 |
+
title={Efficient Natural Language Response Suggestion for Smart Reply},
|
| 642 |
+
author={Matthew Henderson and Rami Al-Rfou and Brian Strope and Yun-hsuan Sung and Laszlo Lukacs and Ruiqi Guo and Sanjiv Kumar and Balint Miklos and Ray Kurzweil},
|
| 643 |
+
year={2017},
|
| 644 |
+
eprint={1705.00652},
|
| 645 |
+
archivePrefix={arXiv},
|
| 646 |
+
primaryClass={cs.CL}
|
| 647 |
+
}
|
| 648 |
+
```
|
| 649 |
+
|
| 650 |
+
<!--
|
| 651 |
+
## Glossary
|
| 652 |
+
|
| 653 |
+
*Clearly define terms in order to be accessible across audiences.*
|
| 654 |
+
-->
|
| 655 |
+
|
| 656 |
+
<!--
|
| 657 |
+
## Model Card Authors
|
| 658 |
+
|
| 659 |
+
*Lists the people who create the model card, providing recognition and accountability for the detailed work that goes into its construction.*
|
| 660 |
+
-->
|
| 661 |
+
|
| 662 |
+
<!--
|
| 663 |
+
## Model Card Contact
|
| 664 |
+
|
| 665 |
+
*Provides a way for people who have updates to the Model Card, suggestions, or questions, to contact the Model Card authors.*
|
| 666 |
+
-->
|
chat_template.jinja
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{%- if tools %}
|
| 2 |
+
{{- '<|im_start|>system\n' }}
|
| 3 |
+
{%- if messages[0].role == 'system' %}
|
| 4 |
+
{{- messages[0].content + '\n\n' }}
|
| 5 |
+
{%- endif %}
|
| 6 |
+
{{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
|
| 7 |
+
{%- for tool in tools %}
|
| 8 |
+
{{- "\n" }}
|
| 9 |
+
{{- tool | tojson }}
|
| 10 |
+
{%- endfor %}
|
| 11 |
+
{{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
|
| 12 |
+
{%- else %}
|
| 13 |
+
{%- if messages[0].role == 'system' %}
|
| 14 |
+
{{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
|
| 15 |
+
{%- endif %}
|
| 16 |
+
{%- endif %}
|
| 17 |
+
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
|
| 18 |
+
{%- for message in messages[::-1] %}
|
| 19 |
+
{%- set index = (messages|length - 1) - loop.index0 %}
|
| 20 |
+
{%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
|
| 21 |
+
{%- set ns.multi_step_tool = false %}
|
| 22 |
+
{%- set ns.last_query_index = index %}
|
| 23 |
+
{%- endif %}
|
| 24 |
+
{%- endfor %}
|
| 25 |
+
{%- for message in messages %}
|
| 26 |
+
{%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
|
| 27 |
+
{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
|
| 28 |
+
{%- elif message.role == "assistant" %}
|
| 29 |
+
{%- set content = message.content %}
|
| 30 |
+
{%- set reasoning_content = '' %}
|
| 31 |
+
{%- if message.reasoning_content is defined and message.reasoning_content is not none %}
|
| 32 |
+
{%- set reasoning_content = message.reasoning_content %}
|
| 33 |
+
{%- else %}
|
| 34 |
+
{%- if '</think>' in message.content %}
|
| 35 |
+
{%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
|
| 36 |
+
{%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
|
| 37 |
+
{%- endif %}
|
| 38 |
+
{%- endif %}
|
| 39 |
+
{%- if loop.index0 > ns.last_query_index %}
|
| 40 |
+
{%- if loop.last or (not loop.last and reasoning_content) %}
|
| 41 |
+
{{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
|
| 42 |
+
{%- else %}
|
| 43 |
+
{{- '<|im_start|>' + message.role + '\n' + content }}
|
| 44 |
+
{%- endif %}
|
| 45 |
+
{%- else %}
|
| 46 |
+
{{- '<|im_start|>' + message.role + '\n' + content }}
|
| 47 |
+
{%- endif %}
|
| 48 |
+
{%- if message.tool_calls %}
|
| 49 |
+
{%- for tool_call in message.tool_calls %}
|
| 50 |
+
{%- if (loop.first and content) or (not loop.first) %}
|
| 51 |
+
{{- '\n' }}
|
| 52 |
+
{%- endif %}
|
| 53 |
+
{%- if tool_call.function %}
|
| 54 |
+
{%- set tool_call = tool_call.function %}
|
| 55 |
+
{%- endif %}
|
| 56 |
+
{{- '<tool_call>\n{"name": "' }}
|
| 57 |
+
{{- tool_call.name }}
|
| 58 |
+
{{- '", "arguments": ' }}
|
| 59 |
+
{%- if tool_call.arguments is string %}
|
| 60 |
+
{{- tool_call.arguments }}
|
| 61 |
+
{%- else %}
|
| 62 |
+
{{- tool_call.arguments | tojson }}
|
| 63 |
+
{%- endif %}
|
| 64 |
+
{{- '}\n</tool_call>' }}
|
| 65 |
+
{%- endfor %}
|
| 66 |
+
{%- endif %}
|
| 67 |
+
{{- '<|im_end|>\n' }}
|
| 68 |
+
{%- elif message.role == "tool" %}
|
| 69 |
+
{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
|
| 70 |
+
{{- '<|im_start|>user' }}
|
| 71 |
+
{%- endif %}
|
| 72 |
+
{{- '\n<tool_response>\n' }}
|
| 73 |
+
{{- message.content }}
|
| 74 |
+
{{- '\n</tool_response>' }}
|
| 75 |
+
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
|
| 76 |
+
{{- '<|im_end|>\n' }}
|
| 77 |
+
{%- endif %}
|
| 78 |
+
{%- endif %}
|
| 79 |
+
{%- endfor %}
|
| 80 |
+
{%- if add_generation_prompt %}
|
| 81 |
+
{{- '<|im_start|>assistant\n' }}
|
| 82 |
+
{%- if enable_thinking is defined and enable_thinking is false %}
|
| 83 |
+
{{- '<think>\n\n</think>\n\n' }}
|
| 84 |
+
{%- endif %}
|
| 85 |
+
{%- endif %}
|
config.json
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"Qwen3Model"
|
| 4 |
+
],
|
| 5 |
+
"attention_bias": false,
|
| 6 |
+
"attention_dropout": 0.0,
|
| 7 |
+
"bos_token_id": 151643,
|
| 8 |
+
"dtype": "bfloat16",
|
| 9 |
+
"eos_token_id": 151643,
|
| 10 |
+
"head_dim": 128,
|
| 11 |
+
"hidden_act": "silu",
|
| 12 |
+
"hidden_size": 1024,
|
| 13 |
+
"initializer_range": 0.02,
|
| 14 |
+
"intermediate_size": 3072,
|
| 15 |
+
"layer_types": [
|
| 16 |
+
"full_attention",
|
| 17 |
+
"full_attention",
|
| 18 |
+
"full_attention",
|
| 19 |
+
"full_attention",
|
| 20 |
+
"full_attention",
|
| 21 |
+
"full_attention",
|
| 22 |
+
"full_attention",
|
| 23 |
+
"full_attention",
|
| 24 |
+
"full_attention",
|
| 25 |
+
"full_attention",
|
| 26 |
+
"full_attention",
|
| 27 |
+
"full_attention",
|
| 28 |
+
"full_attention",
|
| 29 |
+
"full_attention",
|
| 30 |
+
"full_attention",
|
| 31 |
+
"full_attention",
|
| 32 |
+
"full_attention",
|
| 33 |
+
"full_attention",
|
| 34 |
+
"full_attention",
|
| 35 |
+
"full_attention",
|
| 36 |
+
"full_attention",
|
| 37 |
+
"full_attention",
|
| 38 |
+
"full_attention",
|
| 39 |
+
"full_attention",
|
| 40 |
+
"full_attention",
|
| 41 |
+
"full_attention",
|
| 42 |
+
"full_attention",
|
| 43 |
+
"full_attention"
|
| 44 |
+
],
|
| 45 |
+
"max_position_embeddings": 32768,
|
| 46 |
+
"max_window_layers": 28,
|
| 47 |
+
"model_type": "qwen3",
|
| 48 |
+
"num_attention_heads": 16,
|
| 49 |
+
"num_hidden_layers": 28,
|
| 50 |
+
"num_key_value_heads": 8,
|
| 51 |
+
"pad_token_id": null,
|
| 52 |
+
"rms_norm_eps": 1e-06,
|
| 53 |
+
"rope_parameters": {
|
| 54 |
+
"rope_theta": 1000000,
|
| 55 |
+
"rope_type": "default"
|
| 56 |
+
},
|
| 57 |
+
"sliding_window": null,
|
| 58 |
+
"tie_word_embeddings": true,
|
| 59 |
+
"transformers_version": "5.2.0",
|
| 60 |
+
"use_cache": true,
|
| 61 |
+
"use_sliding_window": false,
|
| 62 |
+
"vocab_size": 151669
|
| 63 |
+
}
|
config_sentence_transformers.json
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"prompts": {
|
| 3 |
+
"query": "Instruct: Given a web search query, retrieve relevant passages that answer the query\nQuery:",
|
| 4 |
+
"document": ""
|
| 5 |
+
},
|
| 6 |
+
"default_prompt_name": null,
|
| 7 |
+
"similarity_fn_name": "cosine",
|
| 8 |
+
"model_type": "SentenceTransformer",
|
| 9 |
+
"__version__": {
|
| 10 |
+
"sentence_transformers": "5.2.3",
|
| 11 |
+
"transformers": "5.2.0",
|
| 12 |
+
"pytorch": "2.10.0+cu128"
|
| 13 |
+
}
|
| 14 |
+
}
|
model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ba039cd5972d14e9cfabc2b5cfb3de4c1476dfb232289aaee91d7f1e398dd2b7
|
| 3 |
+
size 1191586416
|
modules.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"idx": 0,
|
| 4 |
+
"name": "0",
|
| 5 |
+
"path": "",
|
| 6 |
+
"type": "sentence_transformers.models.Transformer"
|
| 7 |
+
},
|
| 8 |
+
{
|
| 9 |
+
"idx": 1,
|
| 10 |
+
"name": "1",
|
| 11 |
+
"path": "1_Pooling",
|
| 12 |
+
"type": "sentence_transformers.models.Pooling"
|
| 13 |
+
},
|
| 14 |
+
{
|
| 15 |
+
"idx": 2,
|
| 16 |
+
"name": "2",
|
| 17 |
+
"path": "2_Normalize",
|
| 18 |
+
"type": "sentence_transformers.models.Normalize"
|
| 19 |
+
}
|
| 20 |
+
]
|
sentence_bert_config.json
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"max_seq_length": 2048,
|
| 3 |
+
"do_lower_case": false
|
| 4 |
+
}
|
tokenizer.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:87cc7e2f16f83b01ce44f4646e00d7fd25b316623df77145c9ea4416c05d322b
|
| 3 |
+
size 11423968
|
tokenizer_config.json
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_prefix_space": false,
|
| 3 |
+
"backend": "tokenizers",
|
| 4 |
+
"bos_token": null,
|
| 5 |
+
"clean_up_tokenization_spaces": false,
|
| 6 |
+
"eos_token": "<|im_end|>",
|
| 7 |
+
"errors": "replace",
|
| 8 |
+
"is_local": false,
|
| 9 |
+
"model_max_length": 131072,
|
| 10 |
+
"pad_token": "<|endoftext|>",
|
| 11 |
+
"split_special_tokens": false,
|
| 12 |
+
"tokenizer_class": "Qwen2Tokenizer",
|
| 13 |
+
"unk_token": null
|
| 14 |
+
}
|