Clustering Recipes: Ingredients
This post explores what sort of information we can glean from clustering recipes by ingredient, using 1,129 recipes from the Good Food website. Three clusters seem to have emerged: desserts (yellow), spicier foods (red), and ‘all the rest’ (blue).
Python Source
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer
import json
import os
import re
import numpy as np
from itertools import chain
# A set of the most common 10,000 words (the file is assumed to hold one word
# per line). Built as a real set rather than a list so that the per-token
# "is this a common word?" checks further down are O(1) instead of a linear
# scan over the whole vocabulary.
with open("./common-words.txt", 'r') as infile:
    common_words = {line.strip() for line in infile}
# Load the recipe data: every regular file directly inside ./data is parsed as
# one JSON recipe document.
path = "./data"
files = [
    entry
    for entry in os.listdir(path)
    if os.path.isfile(os.path.join(path, entry))
]

# Parallel lists: ingredient_sets[i] holds the raw text fragments for the
# recipe whose title is titles[i].
ingredient_sets = []
titles = []
for file in files:
    with open(os.path.join(path, file), 'r') as infile:
        data = json.load(infile)
    # Assumes 'ingredients' is a list of strings - TODO confirm against the data.
    words = data['ingredients']
    title = data['title']
    # Titles tend to mention ingredients, so the title is added as one more
    # fragment alongside the ingredient lines.
    words.append(title)
    ingredient_sets.append(words)
    titles.append(title)
# Reduce each recipe's raw fragments to one space-separated string of
# distinctive ingredient words.
#
# The patterns are compiled once, and written as raw strings so that "\s" and
# "\b" reach the regex engine verbatim instead of relying on Python passing
# unknown string escapes through (which newer interpreters warn about).

# Anything that is not an ASCII letter or whitespace (digits, punctuation...).
non_letter_pattern = re.compile(r'[^a-zA-Z\s]')
# Units and measurement filler, matched as whole words only.
unit_pattern = re.compile(
    r'\b(x|can(s?)|frac|tbsp(s?)|tsp(s?)|g|l|ml|ozg|goz|oz|fl|kg|c|cup(s?)'
    r'|tablespoon(s?)|teaspoon(s?)|handful(s?))\b'
)
whitespace_pattern = re.compile(r'\s+')

# Set membership is O(1); copying is cheap even if common_words already is a set.
common_word_set = set(common_words)
# Crude adverb / participle filter ("finely", "chopped", ...).
adjective_suffixes = ('ly', 'ed')

# First take out anything that's not a-z or a space, trim and lower-case the
# string, then strip the unit words and drop fragments that end up blank.
cleaned_sets = []
for ingredient_set in ingredient_sets:
    cleaned = []
    for fragment in ingredient_set:
        text = non_letter_pattern.sub("", fragment).strip().lower()
        text = unit_pattern.sub("", text).strip()
        if text:
            cleaned.append(text)
    cleaned_sets.append(cleaned)
ingredient_sets = cleaned_sets

# Split every surviving fragment into its individual words.
per_ingredient_words = [
    [whitespace_pattern.split(text) for text in ingredient_set]
    for ingredient_set in ingredient_sets
]

# Per recipe: flatten the fragments, drop common words and "-ly" / "-ed"
# words, de-duplicate, and rejoin into one space-separated string.
all_ingredient_words = []
for ingredient_words in per_ingredient_words:
    distinctive = {
        word
        for word in chain.from_iterable(ingredient_words)
        if word not in common_word_set and not word.endswith(adjective_suffixes)
    }
    # Sets have no stable iteration order for strings across runs; sorting
    # makes the joined text reproducible.
    all_ingredient_words.append(" ".join(sorted(distinctive)))
# Work out how important words are relative to other words in the collection.
tfidf = TfidfVectorizer(stop_words="english")
X = tfidf.fit_transform(all_ingredient_words)

# Reduce the dimensionality, as we have a very sparse matrix. fit_transform()
# already fits the model, so a separate fit() beforehand only repeats the
# decomposition.
svd = TruncatedSVD(n_components=3)
Y = svd.fit_transform(X)
# fit() returns the estimator itself, so the old `svd_fit` name is kept as an
# alias in case anything else still refers to it.
svd_fit = svd

# Cluster the recipes into three groups in the reduced space.
model = AgglomerativeClustering(n_clusters=3)
model.fit(Y)
clusters = model.labels_.tolist()

# We need to shift it into something two-dimensional that we can visualize -
# t-SNE is a good approach here. Newer scikit-learn releases call the
# iteration cap `max_iter`, older ones `n_iter`; an unknown keyword raises
# TypeError at construction, so try the new name first and fall back.
tsne_options = {'n_components': 2, 'perplexity': 30}
try:
    tsne = TSNE(max_iter=300, **tsne_options)
except TypeError:
    tsne = TSNE(n_iter=300, **tsne_options)
pos = tsne.fit_transform(Y)
xs, ys = pos[:, 0], pos[:, 1]
from bokeh.plotting import figure, output_notebook, output_file, show
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.models.tools import BoxZoomTool, ZoomInTool, ZoomOutTool, ResetTool
from bokeh.palettes import Spectral6
from bokeh.transform import linear_cmap
import pandas as pd
# Render the 2-D embedding as an interactive Bokeh scatter plot inside the
# notebook: one point per recipe, coloured by cluster label.
output_notebook()

source = ColumnDataSource({
    'x': xs,
    'y': ys,
    'label': clusters,
    'title': titles,
    'words': all_ingredient_words,
})

# Map cluster labels onto the palette. The range is taken straight from
# `clusters`; the earlier `df.label` referred to a DataFrame that is never
# created and raised NameError.
mapper = linear_cmap(
    field_name='label',
    palette=Spectral6,
    low=min(clusters),
    high=max(clusters),
)

# Hovering a point shows its coordinates, the recipe title and the words that
# were fed into the clustering.
hover = HoverTool(tooltips=[
    ("(x,y)", "(@x, @y)"),
    ('title', '@title'),
    ('words', '@words'),
])

# NOTE(review): `plot_width` appears to have been replaced by `width` in newer
# Bokeh releases - confirm against the installed version before changing it.
p = figure(
    title="Recipe Ingredient Similarity",
    x_axis_label='x',
    y_axis_label='y',
    tools=[hover],
    plot_width=1000,
)
p.add_tools(BoxZoomTool(), ZoomInTool(), ZoomOutTool(), ResetTool())
p.circle('x', 'y', source=source, color=mapper, size=12)
show(p)