Predict If The Text Is Spam Or Not In NLP
NLP Pipeline
The example below builds a minimal spam-detection pipeline: the raw messages are tokenized with spaCy, converted into TF-IDF features, and used to train a linear SVM that predicts whether a new message is spam or ham.
Text
A small labelled training set: "spam" marks unwanted promotional messages and "ham" marks normal ones.
texts = [
"Win cash now!!! Click this link",
"Congratulations, you have won a prize",
"Hey, are we meeting today?",
"Please call me when you are free",
"Limited offer! Buy now and get 50% off",
"Let's have lunch tomorrow"
]
labels = ["spam","spam","ham","ham","spam","ham"]
Tokens
A spaCy-based tokenizer that removes stop words and returns the lemma of each remaining token.
# Tokenizer: remove stop words and lemmatize each remaining token with spaCy
def spacy_tokenizer(text):
    doc = nlp(text)
    tokens = []
    for token in doc:
        if not token.is_stop:
            tokens.append(token.lemma_)
    return tokens
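As a quick sanity check, you can run the tokenizer on one of the training messages; a minimal sketch, assuming the en_core_web_sm model is installed (exact lemmas and punctuation handling depend on the spaCy model):
# Illustrative check: stop words such as "now" and "this" are dropped, the rest is lemmatized
print(spacy_tokenizer("Win cash now!!! Click this link"))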
TF-IDF Fit and Transform
# Initialize the vectorizer
# fit: learns the vocabulary and the IDF weight of each term from the input texts
# transform: converts the texts into a TF-IDF feature matrix
# token_pattern=None silences the warning scikit-learn emits when a custom tokenizer is supplied
vectorizer = TfidfVectorizer(tokenizer=spacy_tokenizer, lowercase=True, token_pattern=None)
X = vectorizer.fit_transform(texts)
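To see what fit_transform learned, you can inspect the vocabulary and the shape of the resulting matrix; a small sketch using scikit-learn's standard accessors (get_feature_names_out requires scikit-learn 1.0+):
# The terms kept by the tokenizer and the resulting matrix shape
print(vectorizer.get_feature_names_out())  # unique terms in the learned vocabulary
print(X.shape)                             # (number of messages, vocabulary size)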
Model
A linear Support Vector Machine (LinearSVC) trained on the TF-IDF features and the spam/ham labels.
# Initialize the Model
model = LinearSVC()
model.fit(X, labels)
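LinearSVC does not produce probabilities, but its decision_function returns the signed distance of each message from the separating hyperplane, which can serve as a rough confidence score; a minimal sketch:
# Positive scores lean toward model.classes_[1], negative toward model.classes_[0]
print(model.classes_)              # the label order used by the scores
print(model.decision_function(X))  # one score per training message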
Prediction
The new message is converted with the already-fitted vectorizer (transform only, never fit_transform again) and passed to the trained model.
# predict
new_message = ["Congratulations! You win free cash now"]
X_new = vectorizer.transform(new_message)
prediction = model.predict(X_new)
print("Prediction:", prediction[0])
Output
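With this tiny training set, the new message shares key terms such as "congratulations", "win", and "cash" with the spam examples, so the script is expected to print:
Prediction: spam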
Complete Code
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
import spacy
# Load the English model (requires: python -m spacy download en_core_web_sm)
nlp = spacy.load("en_core_web_sm")
texts = [
"Win cash now!!! Click this link",
"Congratulations, you have won a prize",
"Hey, are we meeting today?",
"Please call me when you are free",
"Limited offer! Buy now and get 50% off",
"Let's have lunch tomorrow"
]
labels = ["spam","spam","ham","ham","spam","ham"]
# Tokenizer: remove stop words and lemmatize each remaining token with spaCy
def spacy_tokenizer(text):
    doc = nlp(text)
    tokens = []
    for token in doc:
        if not token.is_stop:
            tokens.append(token.lemma_)
    return tokens
# Initialize the vectorizer
# fit: learns the vocabulary and the IDF weight of each term from the input texts
# transform: converts the texts into a TF-IDF feature matrix
# token_pattern=None silences the warning scikit-learn emits when a custom tokenizer is supplied
vectorizer = TfidfVectorizer(tokenizer=spacy_tokenizer, lowercase=True, token_pattern=None)
X = vectorizer.fit_transform(texts)
# Initialize the Model
model = LinearSVC()
model.fit(X, labels)
# predict
new_message = ["Congratulations! You win free cash now"]
X_new = vectorizer.transform(new_message)
prediction = model.predict(X_new)
print("Prediction:", prediction[0])
Posted By: Karan Gupta
Posted On: Wednesday, January 7, 2026