Natural Language Processing — Teaching Computers Language
Natural Language Processing (NLP) enables computers to understand, generate and reason about human language.
Classic Pipeline
- Tokenisation and normalisation.
- Part-of-speech tagging and parsing.
- Named-entity recognition.
- Semantic representation — embeddings (a sketch of these stages follows this list).
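A minimal sketch of the first three stages, assuming spaCy and its small English model (en_core_web_sm) are installed; the sample sentence is an illustrative choice, and the embedding stage itself is covered by Example 1 further below.
# Classic pipeline sketch: tokenisation, normalisation, POS tags, parse heads, entities
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple opened a new London office in March 2023.")
for token in doc:
    # surface form, normalised form (lower-cased lemma), part-of-speech tag, syntactic head
    print(f"{token.text:<8} {token.lemma_.lower():<8} {token.pos_:<6} head={token.head.text}")
print("entities:", [(ent.text, ent.label_) for ent in doc.ents])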
Modern NLP
Transformer-based models such as BERT, GPT and their successors have redefined the field and now power today's chatbots, search engines and translation tools.
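To make the idea of contextual representations concrete, here is a hedged sketch that pulls per-token vectors from a pretrained BERT checkpoint; bert-base-uncased and the example sentences are illustrative choices, and the snippet assumes the transformers library and PyTorch are installed.
# Contextual embeddings sketch: the same word gets different vectors in different contexts
import torch
from transformers import AutoModel, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")
sentences = ["The bank raised interest rates.", "We sat on the river bank."]
batch = tokenizer(sentences, padding=True, return_tensors="pt")
with torch.no_grad():
    hidden = model(**batch).last_hidden_state  # shape: (batch, tokens, 768)
bank_vectors = []
for i in range(len(sentences)):
    tokens = tokenizer.convert_ids_to_tokens(batch["input_ids"][i].tolist())
    bank_vectors.append(hidden[i, tokens.index("bank")])
cos = torch.nn.functional.cosine_similarity(bank_vectors[0], bank_vectors[1], dim=0)
print("cosine similarity of the two 'bank' vectors:", round(cos.item(), 3))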
Code Examples: NLP Guide (5 runnable snippets)
Copy any block into a file or notebook and run it end-to-end — each example stands alone.
Example 1: Semantic search with sentence embeddings
# Example 1: Semantic search with sentence embeddings -- NLP Guide
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("all-MiniLM-L6-v2")
docs = [
    "A neural network learns a nonlinear mapping from inputs to outputs.",
    "Gradient boosting trains shallow trees sequentially on residuals.",
    "Transformers use self-attention to model token dependencies.",
    "The central limit theorem underpins many inference procedures.",
]
query = "How do attention mechanisms work?"
E = model.encode(docs + [query], normalize_embeddings=True)
# Embeddings are L2-normalised, so dot products are cosine similarities.
sims = E[-1] @ E[:-1].T
for doc, s in sorted(zip(docs, sims), key=lambda t: -t[1]):
    print(f"{s:.3f} {doc}")
Example 2: Transformers pipeline for zero-shot classification
# Example 2: Transformers pipeline for zero-shot classification -- NLP Guide
from transformers import pipeline
classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    device=-1,  # CPU; set to 0 for the first GPU
)
texts = [
    "Our quarterly revenue exceeded forecasts by 12%.",
    "The deployment failed after the memory leak in the worker pool.",
    "This apple pie is delicious and the crust is perfectly flaky.",
]
labels = ["business", "engineering", "food", "politics"]
for text in texts:
    result = classifier(text, candidate_labels=labels, multi_label=False)
    pairs = list(zip(result["labels"], result["scores"]))
    top = ", ".join(f"{l}={s:.2f}" for l, s in pairs[:3])
    print(f"- {text[:50]}...\n  {top}")
Example 3: BPE tokenizer training with tokenizers
# Example 3: BPE tokenizer training with tokenizers -- NLP Guide
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()
trainer = BpeTrainer(
    vocab_size=2_000,
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]"],
    min_frequency=2,
)
corpus = [
    "machine learning tokenizers split words into sub-units",
    "byte pair encoding merges frequent pairs iteratively",
    "a small vocabulary keeps the model efficient",
] * 50
tokenizer.train_from_iterator(corpus, trainer=trainer)
enc = tokenizer.encode("machine learning is efficient")
print("tokens:", enc.tokens)
print("ids   :", enc.ids)
print("vocab :", tokenizer.get_vocab_size())
Example 4: TF-IDF + logistic regression text classifier
# Example 4: TF-IDF + logistic regression text classifier -- NLP Guide
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
cats = ["sci.space", "rec.sport.hockey", "talk.politics.misc"]
train = fetch_20newsgroups(subset="train", categories=cats,
                           remove=("headers", "footers"))
test = fetch_20newsgroups(subset="test", categories=cats,
                          remove=("headers", "footers"))
pipe = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=20_000, ngram_range=(1, 2))),
    ("clf", LogisticRegression(max_iter=1_000, C=1.0)),
])
pipe.fit(train.data, train.target)
# Use the dataset's own target_names: targets are indexed in sorted category
# order, which differs from the order of `cats` above.
print(classification_report(test.target, pipe.predict(test.data),
                            target_names=test.target_names, digits=3))
Example 5: Named-entity recognition with spaCy
# Example 5: Named-entity recognition with spaCy -- NLP Guide
import spacy
nlp = spacy.load("en_core_web_sm")
text = (
    "In 2024, OpenAI and Microsoft announced a multi-billion-dollar "
    "partnership in Redmond, Washington, to accelerate AI research."
)
doc = nlp(text)
for ent in doc.ents:
    print(f"{ent.text:<30} {ent.label_:<12} ({ent.start_char}..{ent.end_char})")
orgs = [ent.text for ent in doc.ents if ent.label_ == "ORG"]
print("organisations:", orgs)