Hacker News Data Exploration
I have the Hacker News data in the form of Parquet files. It is parsed with my hn-data-fetcher project.
I will be using DuckDB to explore the Hacker News data. I found it much faster than SQLite, both in terms of performance and ease of use, as I don’t need to create indexes on the table to make it workable.
import duckdb

# Open an in-memory DuckDB database and expose all Parquet partitions
# as a single view named `hn` that the rest of the notebook queries.
conn = duckdb.connect()
conn.sql(
    "CREATE VIEW hn AS SELECT * FROM read_parquet('~/hn-data-fetcher/data_parquet/*/*/*.parquet')"
)

# Sanity check: total number of items in the dataset.
(count,) = conn.sql("SELECT COUNT(*) FROM hn").fetchone()
print(f"Total rows: {count}")
Total rows: 41636178
Schema of the table
# Print the column names and inferred types of the `hn` view.
conn.sql("DESCRIBE hn").show()
┌─────────────┬──────────────┬─────────┬─────────┬─────────┬─────────┐ │ column_name │ column_type │ null │ key │ default │ extra │ │ varchar │ varchar │ varchar │ varchar │ varchar │ varchar │ ├─────────────┼──────────────┼─────────┼─────────┼─────────┼─────────┤ │ id │ BIGINT │ YES │ NULL │ NULL │ NULL │ │ time │ TIMESTAMP_NS │ YES │ NULL │ NULL │ NULL │ │ by │ VARCHAR │ YES │ NULL │ NULL │ NULL │ │ title │ VARCHAR │ YES │ NULL │ NULL │ NULL │ │ url │ VARCHAR │ YES │ NULL │ NULL │ NULL │ │ text │ VARCHAR │ YES │ NULL │ NULL │ NULL │ │ score │ DOUBLE │ YES │ NULL │ NULL │ NULL │ │ type │ VARCHAR │ YES │ NULL │ NULL │ NULL │ │ parent │ DOUBLE │ YES │ NULL │ NULL │ NULL │ │ poll │ DOUBLE │ YES │ NULL │ NULL │ NULL │ │ parts │ INTEGER │ YES │ NULL │ NULL │ NULL │ │ descendants │ DOUBLE │ YES │ NULL │ NULL │ NULL │ │ dead │ INTEGER │ YES │ NULL │ NULL │ NULL │ │ deleted │ INTEGER │ YES │ NULL │ NULL │ NULL │ │ kids │ BIGINT[] │ YES │ NULL │ NULL │ NULL │ │ month │ BIGINT │ YES │ NULL │ NULL │ NULL │ │ year │ BIGINT │ YES │ NULL │ NULL │ NULL │ ├─────────────┴──────────────┴─────────┴─────────┴─────────┴─────────┤ │ 17 rows 6 columns │ └────────────────────────────────────────────────────────────────────┘
Number of posts and comments
Show code (27 lines)
import altair as alt

# Count items per content type (story, comment, job, poll, ...).
df = conn.sql("""
    SELECT
        type,
        COUNT(*) as count
    FROM hn
    GROUP BY type
""").df()

# Base bar chart. NOTE: a `text` channel on mark_bar() is never rendered
# by the bar mark, so the original `text=alt.Text(...)` encoding here was
# dead weight — the labels come from the dedicated text layer below.
bar = alt.Chart(df).mark_bar().encode(
    x=alt.X('type:N', title='Content Type'),
    y=alt.Y('count:Q', title='Count')
).properties(
    title='Distribution of Hacker News Content Types',
    width=600
)

# Value labels drawn just above each bar, sharing the bar layer's encodings.
text = bar.mark_text(
    align='center',
    dy=-10  # Move text up by 10 pixels
).encode(
    text=alt.Text('count:Q', format=',')
)

chart = alt.layer(bar, text)
chart
Number of comments and story over time
Show code (31 lines)
import pandas as pd

# Monthly counts of comments and stories.
df = conn.sql("""
    SELECT
        year,
        month,
        type,
        COUNT(*) as count
    FROM hn
    WHERE type IN ('comment', 'story')
    GROUP BY year, month, type
    ORDER BY year, month
""").df()

# Collapse the (year, month) pair into a real datetime for the time axis.
df['date'] = pd.to_datetime(df['year'].astype(str) + '-' + df['month'].astype(str) + '-01')

hover = [
    alt.Tooltip('date:T', title='Date', format='%B %Y'),
    alt.Tooltip('type:N', title='Type'),
    alt.Tooltip('count:Q', title='Count', format=','),
]

chart = (
    alt.Chart(df)
    .mark_line()
    .encode(
        x=alt.X('date:T', title='Date'),
        y=alt.Y('count:Q', title='Count'),
        color=alt.Color('type:N', title='Content Type'),
        tooltip=hover,
    )
    .properties(
        title='Monthly Comments and Stories on Hacker News',
        width=800,
        height=400,
    )
)
chart
Interesting that the number of comments increases over time, while the number of stories seems to have stopped growing since 2012.
There is also a drop in comments starting at the end of 2023.
Activity heatmap
Show code (30 lines)
# Bucket every item by hour-of-day and day-of-week after shifting the
# timestamps by a fixed -6 hours (US Central, DST ignored).
df = conn.sql("""
    SELECT
        EXTRACT(HOUR FROM (time - INTERVAL '6' HOUR)) as hour,
        EXTRACT(DOW FROM (time - INTERVAL '6' HOUR)) as day_of_week,
        COUNT(*) as count
    FROM hn
    GROUP BY hour, day_of_week
    ORDER BY hour, day_of_week
""").df()

# DOW is 0=Sunday .. 6=Saturday; translate to readable labels.
day_labels = ['Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat']
df['day_name'] = [day_labels[int(dow)] for dow in df['day_of_week']]

hover = [
    alt.Tooltip('hour:O', title='Hour'),
    alt.Tooltip('day_name:O', title='Day'),
    alt.Tooltip('count:Q', title='Count', format=','),
]

chart = (
    alt.Chart(df)
    .mark_rect()
    .encode(
        x=alt.X('hour:O', title='Hour of Day (CST)'),
        y=alt.Y('day_name:O', title='Day of Week', sort=day_labels),
        color=alt.Color('count:Q', title='Number of Posts',
                        scale=alt.Scale(scheme='greens')),
        tooltip=hover,
    )
    .properties(
        title='Activity Heatmap by Hour and Day of Week',
        width=800,
        height=300,
    )
)
chart
The hour is shifted to US Central Time (−6 hours) since a lot of Hacker News users are in the USA.
Looks like the most active time is afternoon on work days. Are people using Hacker News to procrastinate after lunch break?
Hiring trends
Every month, there are two threads on Hacker News that are dedicated to hiring. One is about who is hiring, and the other is about who wants to be hired.
Show code (55 lines)
# Comment volume of the two monthly hiring mega-threads since 2013.
# `timeline` packs (year, month) into a single fractional-year x value.
df = conn.sql("""
    WITH hiring_posts AS (
        SELECT
            year,
            month,
            'Who is Hiring' as type,
            SUM(descendants) as comments
        FROM hn
        WHERE title LIKE '%Who is hiring%'
            AND type = 'story'
            AND year >= 2013
        GROUP BY year, month

        UNION ALL

        SELECT
            year,
            month,
            'Who wants to be hired' as type,
            SUM(descendants) as comments
        FROM hn
        WHERE title LIKE '%Who wants to be hired%'
            AND type = 'story'
            AND year >= 2013
        GROUP BY year, month
    )
    SELECT
        year,
        month,
        type,
        comments,
        year + month/12.0 as timeline
    FROM hiring_posts
    ORDER BY year, month, type
""").df()

hover = [
    alt.Tooltip('year:Q', title='Year'),
    alt.Tooltip('month:Q', title='Month'),
    alt.Tooltip('type:N', title='Type'),
    alt.Tooltip('comments:Q', title='Comments', format='.1f'),
]

chart = (
    alt.Chart(df)
    .mark_line(point=False)
    .encode(
        # 'd' format keeps the fractional-year axis labeled with plain integers.
        x=alt.X('timeline:Q', title='Year', axis=alt.Axis(format='d')),
        y=alt.Y('comments:Q', title='Comments per Thread'),
        color=alt.Color('type:N', title='Thread Type'),
        tooltip=hover,
    )
    .properties(
        title='Monthly Hiring vs Seeking Work Thread Activity',
        width=800,
        height=400,
    )
)
chart
Looking at the graph, we can see several interesting patterns:
Overall trend: The “Who is hiring” threads consistently get more comments than “Who wants to be hired” threads throughout the entire period.
Peak activity: There was a notable spike in activity around 2020-2021, particularly in the “Who is hiring” threads, possibly related to the tech industry changes during the COVID-19 pandemic.
Recent trends: After 2021, both thread types show a decline in activity, with comment counts returning to levels similar to 2018-2019. The gap between hiring and seeking work threads has also narrowed in recent years.
Seasonality: There appears to be some seasonal variation in both thread types, with regular ups and downs throughout each year, though the pattern is more pronounced in the hiring threads.
Top story of each year
Show code (21 lines)
# Get top story by score for each year
from IPython.display import HTML

# One row per year: the highest-scoring story (ties are all kept, matching
# the semantics of the original correlated subquery). A window function
# avoids re-scanning the whole table once per outer row.
df = conn.sql("""
    SELECT
        year,
        CONCAT('<a href="https://news.ycombinator.com/item?id=', id, '" target="_blank">', title, '</a>') as title,
        score,
        by
    FROM hn
    WHERE type = 'story'
    QUALIFY score = MAX(score) OVER (PARTITION BY year)
    ORDER BY year DESC
""").df()

# Render titles as clickable HTML links instead of truncated plain text.
pd.set_option('display.max_colwidth', None)
display(HTML(df.to_html(escape=False, index=False)))
Top story of all time
Show code (14 lines)
# Ten highest-scoring stories of all time, with clickable titles.
top_stories_sql = """
    SELECT
        CONCAT('<a href="https://news.ycombinator.com/item?id=', id, '" target="_blank">', title, '</a>') as title,
        score,
        by
    FROM hn
    WHERE type = 'story'
    ORDER BY score DESC
    LIMIT 10
"""
df = conn.sql(top_stories_sql).df()

pd.set_option('display.max_colwidth', None)
display(HTML(df.to_html(escape=False, index=False)))
title | score | by |
---|---|---|
Stephen Hawking has died | 6015.0 | Cogito |
A Message to Our Customers | 5771.0 | epaga |
OpenAI's board has fired Sam Altman | 5710.0 | davidbarker |
Backdoor in upstream xz/liblzma leading to SSH server compromise | 4549.0 | rkta |
CrowdStrike Update: Windows Bluescreen and Boot Loops | 4405.0 | BLKNSLVR |
Steve Jobs has passed away. | 4338.0 | patricktomas |
Bram Moolenaar has died | 4310.0 | wufocaculura |
Mechanical Watch | 4298.0 | todsacerdoti |
YouTube-dl has received a DMCA takedown from RIAA | 4240.0 | phantop |
Reflecting on one very, very strange year at Uber | 4107.0 | grey-area |
Popular Show HN
Show code (14 lines)
# Ten highest-scoring "Show HN" submissions.
show_hn_sql = """
    SELECT
        CONCAT('<a href="https://news.ycombinator.com/item?id=', id, '" target="_blank">', title, '</a>') as title,
        score,
        by
    FROM hn
    WHERE type = 'story'
        AND title LIKE 'Show HN:%'
    ORDER BY score DESC
    LIMIT 10
"""
df = conn.sql(show_hn_sql).df()

pd.set_option('display.max_colwidth', None)
display(HTML(df.to_html(escape=False, index=False)))
title | score | by |
---|---|---|
Show HN: This up votes itself | 3531.0 | olalonde |
Show HN: I made an open-source laptop from scratch | 3237.0 | Hello9999901 |
Show HN: If YouTube had actual channels | 2741.0 | hadisafa |
Show HN: A retro video game console I've been working on in my free time | 2690.0 | pkiller |
Show HN: Redbean – Single-file distributable web server | 1998.0 | jart |
Show HN: Non.io, a Reddit-like platform Ive been working on for the last 4 years | 1943.0 | jjcm |
Show HN: I may have created a new type of puzzle | 1859.0 | drcode |
Show HN: I 3D scanned the interior of the Great Pyramid at Giza | 1752.0 | lukehollis |
Show HN: Web Design in 4 minutes | 1624.0 | bbx |
Show HN: I made a site where you practice typing by retyping entire novels | 1589.0 | Octouroboros |
Popular Ask HN
Show code (15 lines)
# Ten highest-scoring "Ask HN" submissions.
ask_hn_sql = """
    SELECT
        CONCAT('<a href="https://news.ycombinator.com/item?id=', id, '" target="_blank">', title, '</a>') as title,
        score,
        by
    FROM hn
    WHERE type = 'story'
        AND title LIKE 'Ask HN:%'
    ORDER BY score DESC
    LIMIT 10
"""
df = conn.sql(ask_hn_sql).df()

pd.set_option('display.max_colwidth', None)
display(HTML(df.to_html(escape=False, index=False)))
title | score | by |
---|---|---|
Ask HN: I’m an FCC Commissioner proposing regulation of IoT security updates | 3387.0 | SimingtonFCC |
Ask HN: I'm a software engineer going blind, how should I prepare? | 3270.0 | zachrip |
Ask HN: Is S3 down? | 2589.0 | iamdeedubs |
Ask HN: What are some cool but obscure data structures you know about? | 2051.0 | Uptrenda |
Ask HN: What tech job would let me get away with the least real work possible? | 2022.0 | lmueongoqx |
Ask HN: What books changed the way you think about almost everything? | 2009.0 | apitman |
Ask HN: Most interesting tech you built for just yourself? | 1901.0 | l2silver |
Ask HN: What do you do with your Raspberry Pi? | 1741.0 | xylo |
Ask HN: How to self-learn electronics? | 1681.0 | sidyapa |
Ask HN: What are the best MOOCs you've taken? | 1665.0 | csdrane |
Most submitted domain
Show code (25 lines)
# Top 20 domains by raw submission count (scheme + host extracted from url).
df = conn.sql("""
    SELECT
        REGEXP_EXTRACT(url, 'https?://([^/]+)') as domain,
        COUNT(*) as count
    FROM hn
    WHERE type = 'story'
        AND url IS NOT NULL
        AND url != ''
    GROUP BY domain
    ORDER BY count DESC
    LIMIT 20
""").df()

chart = (
    alt.Chart(df)
    .mark_bar()
    .encode(
        x=alt.X('count:Q', title='Number of submissions'),
        y=alt.Y('domain:N', sort='-x', title='Domain'),
        tooltip=['domain', 'count'],
    )
    .properties(
        title='Most Submitted Domains on HN',
        width=800,
        height=400,
    )
)
chart
Most upvoted domain
Show code (27 lines)
# Top 20 domains by cumulative story score.
df = conn.sql("""
    SELECT
        REGEXP_EXTRACT(url, 'https?://([^/]+)') as domain,
        COUNT(*) as count,
        SUM(score) as total_score,
        AVG(score) as avg_score
    FROM hn
    WHERE type = 'story'
        AND url IS NOT NULL
        AND url != ''
    GROUP BY domain
    ORDER BY total_score DESC
    LIMIT 20
""").df()

chart = (
    alt.Chart(df)
    .mark_bar()
    .encode(
        x=alt.X('total_score:Q', title='Total score'),
        y=alt.Y('domain:N', sort='-x', title='Domain'),
        tooltip=['domain', 'total_score', 'count', 'avg_score'],
    )
    .properties(
        title='Most Upvoted Domains on HN',
        width=800,
        height=400,
    )
)
chart
High quality submission
But then, which sources consistently earn high scores, as opposed to domains that are simply submitted often?
Show code (30 lines)
# Domains with the highest average score per submission, restricted to
# domains with at least 10 scored submissions to filter out one-hit wonders.
# NOTE(review): with the `score > 0` filter in place, `avg_score` and
# `score_per_submission` compute the same value (the result table confirms
# they match) — the duplication is kept for output compatibility.
df = conn.sql("""
    SELECT
        REGEXP_EXTRACT(url, 'https?://([^/]+)') as domain,
        COUNT(*) as count,
        SUM(score) as total_score,
        AVG(score) as avg_score,
        total_score / COUNT(*) as score_per_submission
    FROM hn
    WHERE type = 'story'
        AND url IS NOT NULL
        AND url != ''
        AND score > 0
    GROUP BY domain
    HAVING count >= 10
    ORDER BY score_per_submission DESC
    LIMIT 20
""").df()

chart = (
    alt.Chart(df)
    .mark_bar()
    .encode(
        x=alt.X('score_per_submission:Q', title='Average Score per Submission'),
        y=alt.Y('domain:N', sort='-x', title='Domain'),
        tooltip=['domain', 'total_score', 'count', 'avg_score', 'score_per_submission'],
    )
    .properties(
        title='Domains with Highest Average Score per Submission (min 10 submissions)',
        width=800,
        height=400,
    )
)
chart
# Turn each domain cell into a clickable link, then render the table as HTML.
df['domain'] = [f'<a href="{x}" target="_blank">{x}</a>' for x in df['domain']]
display(HTML(df.to_html(escape=False, index=False)))
domain | count | total_score | avg_score | score_per_submission |
---|---|---|---|---|
https://www.susanjfowler.com | 10 | 4781.0 | 478.100000 | 478.100000 |
https://ciechanow.ski | 68 | 28649.0 | 421.308824 | 421.308824 |
https://intuitiveexplanations.com | 12 | 4957.0 | 413.083333 | 413.083333 |
https://dkb.io | 12 | 4532.0 | 377.666667 | 377.666667 |
https://www.phind.com | 13 | 4832.0 | 371.692308 | 371.692308 |
https://learningmusic.ableton.com | 12 | 4422.0 | 368.500000 | 368.500000 |
https://bugs.xdavidhu.me | 11 | 3901.0 | 354.636364 | 354.636364 |
http://blog.rust-lang.org | 34 | 11646.0 | 342.529412 | 342.529412 |
http://blog.chriszacharias.com | 11 | 3727.0 | 338.818182 | 338.818182 |
https://oimo.io | 25 | 7867.0 | 314.680000 | 314.680000 |
http://immersivemath.com | 12 | 3630.0 | 302.500000 | 302.500000 |
https://www.moore.org | 11 | 3019.0 | 274.454545 | 274.454545 |
https://tinyprojects.dev | 20 | 5307.0 | 265.350000 | 265.350000 |
https://tauri.studio | 10 | 2642.0 | 264.200000 | 264.200000 |
https://casparwre.de | 11 | 2862.0 | 260.181818 | 260.181818 |
https://mattferraro.dev | 11 | 2859.0 | 259.909091 | 259.909091 |
https://www.sofuckingagile.com | 10 | 2591.0 | 259.100000 | 259.100000 |
https://asahilinux.org | 28 | 7010.0 | 250.357143 | 250.357143 |
https://omar.website | 10 | 2493.0 | 249.300000 | 249.300000 |
https://www.tablix.org | 10 | 2492.0 | 249.200000 | 249.200000 |
🗒️ Download (notebook.ipynb)