Natural Language Processing — Teaching Computers Language

Natural Language Processing (NLP) enables computers to understand, generate, and reason about human language.

Classic Pipeline

  1. Tokenisation and normalisation.
  2. Part-of-speech tagging and parsing.
  3. Named-entity recognition.
  4. Semantic representation — embeddings.

Modern NLP

Transformer-based models such as BERT, GPT and their successors have redefined the field and power today's chatbots, search engines and translation tools.

Code Examples: NLP Guide (5 runnable snippets)

Copy any block into a file or notebook and run it end-to-end — each example stands alone.

Example 1: Semantic search with sentence embeddings

# Example 1: Semantic search with sentence embeddings -- NLP Guide
from sentence_transformers import SentenceTransformer

encoder = SentenceTransformer("all-MiniLM-L6-v2")
corpus = [
    "A neural network learns a nonlinear mapping from inputs to outputs.",
    "Gradient boosting trains shallow trees sequentially on residuals.",
    "Transformers use self-attention to model token dependencies.",
    "The central limit theorem underpins many inference procedures.",
]
question = "How do attention mechanisms work?"

# Embed the corpus and query in one batch; with unit-normalised vectors
# the dot product equals cosine similarity.
vectors = encoder.encode(corpus + [question], normalize_embeddings=True)
scores = vectors[-1] @ vectors[:-1].T

# Rank documents by similarity, best match first.
ranked = sorted(zip(corpus, scores), key=lambda pair: pair[1], reverse=True)
for text, score in ranked:
    print(f"{score:.3f}  {text}")

Example 2: Transformers pipeline for zero-shot classification

# Example 2: Transformers pipeline for zero-shot classification -- NLP Guide
from transformers import pipeline

# BART fine-tuned on MNLI frames classification as textual entailment,
# so an arbitrary label set works without task-specific training.
zero_shot = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    device=-1,                              # CPU; set 0 for first GPU
)

samples = [
    "Our quarterly revenue exceeded forecasts by 12%.",
    "The deployment failed after the memory leak in the worker pool.",
    "This apple pie is delicious and the crust is perfectly flaky.",
]
candidates = ["business", "engineering", "food", "politics"]

for sample in samples:
    out = zero_shot(sample, candidate_labels=candidates, multi_label=False)
    # Labels come back already sorted by score; show the top three.
    scored = list(zip(out["labels"], out["scores"]))[:3]
    summary = ", ".join(f"{label}={score:.2f}" for label, score in scored)
    print(f"- {sample[:50]}...\n    {summary}")

Example 3: BPE tokenizer training with tokenizers

# Example 3: BPE tokenizer training with tokenizers -- NLP Guide
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer

# Start from an untrained BPE model; [UNK] covers out-of-vocabulary symbols.
bpe_tok = Tokenizer(BPE(unk_token="[UNK]"))
bpe_tok.pre_tokenizer = Whitespace()

trainer = BpeTrainer(
    vocab_size=2_000,
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]"],
    min_frequency=2,
)

# Repeat the tiny corpus so merge-pair counts clear min_frequency.
sentences = [
    "machine learning tokenizers split words into sub-units",
    "byte pair encoding merges frequent pairs iteratively",
    "a small vocabulary keeps the model efficient",
]
bpe_tok.train_from_iterator(sentences * 50, trainer=trainer)

encoded = bpe_tok.encode("machine learning is efficient")
print("tokens:", encoded.tokens)
print("ids   :", encoded.ids)
print("vocab :", bpe_tok.get_vocab_size())

Example 4: TF-IDF + logistic regression text classifier

# Example 4: TF-IDF + logistic regression text classifier -- NLP Guide
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

categories = ["sci.space", "rec.sport.hockey", "talk.politics.misc"]
# Strip metadata so the model learns from body text, not boilerplate.
strip = ("headers", "footers")
train_split = fetch_20newsgroups(subset="train", categories=categories,
                                 remove=strip)
test_split = fetch_20newsgroups(subset="test", categories=categories,
                                remove=strip)

# Sparse unigram+bigram TF-IDF features feeding a linear classifier.
model = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=20_000, ngram_range=(1, 2))),
    ("clf", LogisticRegression(max_iter=1_000, C=1.0)),
])
model.fit(train_split.data, train_split.target)

report = classification_report(
    test_split.target,
    model.predict(test_split.data),
    target_names=categories,
    digits=3,
)
print(report)

Example 5: Named-entity recognition with spaCy

# Example 5: Named-entity recognition with spaCy -- NLP Guide
import spacy

nlp  = spacy.load("en_core_web_sm")
text = (
    "In 2024, OpenAI and Microsoft announced a multi-billion-dollar "
    "partnership in Redmond, Washington, to accelerate AI research."
)
doc = nlp(text)

# Show every detected entity with its label and character span.
for ent in doc.ents:
    print(f"{ent.text:<30} {ent.label_:<12} ({ent.start_char}..{ent.end_char})")

# Collect organisation entities. (Fix: this list was misleadingly named
# `people` although it filters on the ORG label, not PERSON.)
orgs = [ent.text for ent in doc.ents if ent.label_ == "ORG"]
print("organisations:", orgs)