png

In this section, we’re going to work on the other file from the AO3 data dump, the tags.

Loading File

# Load Python library
import pandas as pd

# Read the tags CSV in 10,000-row chunks, then join the chunks into
# one dataframe with a fresh 0..n-1 index
path = "/home/pi/Downloads/tags-20210226.csv"
chunker = pd.read_csv(path, chunksize=10_000)
tags = pd.concat(chunker, ignore_index=True)

# Display the loaded dataframe
tags

	id	type	name	canonical	cached_count	merger_id
0	1	Media	TV Shows	True	910	NaN
1	2	Media	Movies	True	1164	NaN
2	3	Media	Books & Literature	True	134	NaN
3	4	Media	Cartoons & Comics & Graphic Novels	True	166	NaN
4	5	Media	Anime & Manga	True	501	NaN
...	...	...	...	...	...	...
14467133	55395603	Freeform	Redacted	False	0	NaN
14467134	55395606	Freeform	Redacted	False	0	NaN
14467135	55395609	Freeform	Redacted	False	0	NaN
14467136	55395612	Freeform	Redacted	False	0	NaN
14467137	55395615	Freeform	Redacted	False	0	NaN

14467138 rows × 6 columns

Exploring Data

# List the distinct values found in the type column
tags['type'].unique()

array(['Media', 'Rating', 'ArchiveWarning', 'Category', 'Character',
       'Fandom', 'Relationship', 'Freeform', 'UnsortedTag'], dtype=object)

# List the distinct tag names whose type is Media
tags.loc[tags['type'] == 'Media', 'name'].unique()

array(['TV Shows', 'Movies', 'Books & Literature',
       'Cartoons & Comics & Graphic Novels', 'Anime & Manga',
       'Music & Bands', 'Celebrities & Real People', 'Other Media',
       'Video Games', 'No Media', 'Uncategorized Fandoms', 'Theater'],
      dtype=object)

Cleaning and Manipulating Data

Visualize the Media tags according to their cached_count

# Prepare the data set for visualizing Media tags by cached_count:
# keep only the three columns needed, as an independent copy of tags
subset = tags.loc[:, ['type', 'name', 'cached_count']].copy()
subset

	type	name	cached_count
0	Media	TV Shows	910
1	Media	Movies	1164
2	Media	Books & Literature	134
3	Media	Cartoons & Comics & Graphic Novels	166
4	Media	Anime & Manga	501
...	...	...	...
14467133	Freeform	Redacted	0
14467134	Freeform	Redacted	0
14467135	Freeform	Redacted	0
14467136	Freeform	Redacted	0
14467137	Freeform	Redacted	0

14467138 rows × 3 columns

# Keep only the rows whose type is Media, then drop the type column
# (it is the same value on every remaining row)
is_media_row = subset['type'] == 'Media'
media = subset[is_media_row].drop(columns='type')
media

	name	cached_count
0	TV Shows	910
1	Movies	1164
2	Books & Literature	134
3	Cartoons & Comics & Graphic Novels	166
4	Anime & Manga	501
5	Music & Bands	19
6	Celebrities & Real People	33
7	Other Media	11
388	Video Games	448
4510	No Media	0
8150	Uncategorized Fandoms	1
23433	Theater	104

Matplotlib Pie Chart

# Import libraries
# Top line is Jupyter Notebook specific

%matplotlib inline

import matplotlib.pyplot as plt

# Pie chart of every Media tag with matplotlib

# One offset per wedge so that every slice is pulled out slightly
explode = [0.1] * len(media)

fig, axes = plt.subplots()
ax = axes.pie(
    media['cached_count'],
    labels=media['name'],
    explode=explode,
    autopct='%1.1f%%',  # print each wedge's share with one decimal place
)

png

Modifying data set for a better graph

Sort the data, find the 50th quantile (the median), and group the lower values together

# Order the rows from the largest cached_count to the smallest.
# inplace=True modifies the existing dataframe instead of returning a
# new one; ignore_index=True renumbers the rows 0..n-1 after sorting.
media.sort_values(
    by='cached_count',
    ascending=False,
    inplace=True,
    ignore_index=True,
)
media

	name	cached_count
0	Movies	1164
1	TV Shows	910
2	Anime & Manga	501
3	Video Games	448
4	Cartoons & Comics & Graphic Novels	166
5	Books & Literature	134
6	Theater	104
7	Celebrities & Real People	33
8	Music & Bands	19
9	Other Media	11
10	Uncategorized Fandoms	1
11	No Media	0

# Compute the 0.5 quantile (the median) of cached_count
qt_50 = media['cached_count'].quantile(0.5)
qt_50

119.0

# Collect the rows at or below the 50th quantile; these are the rows
# that will be merged into a single slice. Copy them so later edits to
# drop_rows do not touch media.
drop_rows = media.loc[media['cached_count'] <= qt_50].copy()
drop_rows

	name	cached_count
6	Theater	104
7	Celebrities & Real People	33
8	Music & Bands	19
9	Other Media	11
10	Uncategorized Fandoms	1
11	No Media	0

# Total cached_count of the rows at or below the 50th quantile
add_value = media.loc[media['cached_count'] <= qt_50, 'cached_count'].sum()
add_value

# Build the dataframe used for the improved chart:
# keep the rows above the 50th quantile and add one new row, Other,
# whose cached_count is the combined total of the remaining rows.
# DataFrame.append was removed in pandas 2.0, so the new row is wrapped
# in a one-row dataframe and joined with pd.concat instead;
# ignore_index=True renumbers the result 0..n-1.
media2 = pd.concat(
    [
        media[media['cached_count'] > qt_50],
        pd.DataFrame([{'name': 'Other', 'cached_count': add_value}]),
    ],
    ignore_index=True,
)
media2

	name	cached_count
0	Movies	1164
1	TV Shows	910
2	Anime & Manga	501
3	Video Games	448
4	Cartoons & Comics & Graphic Novels	166
5	Books & Literature	134
6	Other	168

# Pie chart of the regrouped media2 dataframe

# One offset per wedge so that every slice is pulled out slightly
explode = [0.1] * len(media2)

fig, axes = plt.subplots()

ax = axes.pie(
    media2['cached_count'],
    labels=media2['name'],
    explode=explode,
    autopct='%1.1f%%',  # print each wedge's share with one decimal place
)

png

Matplotlib Table

# Try out pandas.plotting.table(): draw drop_rows as a table on a
# fresh set of axes. ax is rebound to the value returned by table().
fig, ax = plt.subplots()
ax = pd.plotting.table(ax, data=drop_rows)

png

# Add a Percentage column to drop_rows: each row's cached_count as a
# share of the total over all of media, rounded to two decimal places
drop_rows['Percentage'] = (
    drop_rows['cached_count']
    .div(media['cached_count'].sum())
    .mul(100)
    .round(2)
)
drop_rows

	name	cached_count	Percentage
6	Theater	104	2.98
7	Celebrities & Real People	33	0.95
8	Music & Bands	19	0.54
9	Other Media	11	0.32
10	Uncategorized Fandoms	1	0.03
11	No Media	0	0.00

# Remove the cached_count column, then relabel the two remaining
# columns so the table reads as a breakdown of the Other slice
drop_rows.drop(columns='cached_count', inplace=True)
drop_rows.columns = ['Other', 'Percentage']
drop_rows

	Other	Percentage
6	Theater	2.98
7	Celebrities & Real People	0.95
8	Music & Bands	0.54
9	Other Media	0.32
10	Uncategorized Fandoms	0.03
11	No Media	0.00

Plotting Pie Chart and Table

# Draw the pie chart and the table on the same axes

# One offset per wedge so that every slice is pulled out slightly
explode = [0.1] * len(media2)

fig, axes = plt.subplots(figsize=(10, 8))

ax = axes.pie(
    media2['cached_count'],
    labels=media2['name'],
    explode=explode,
    autopct='%1.1f%%',  # print each wedge's share with one decimal place
)
table = pd.plotting.table(axes, data=drop_rows, cellLoc='center')

# Aesthetics
table.scale(1, 1.5)  # add padding around cell text
ax = axes.set_title('AO3 Tags Breakdown By Media Type \n 2008-2021')

png