]> git.ipfire.org Git - location/libloc.git/commitdiff
importer: Make the export 200x faster
authorMichael Tremer <michael.tremer@ipfire.org>
Mon, 18 Mar 2024 16:56:50 +0000 (16:56 +0000)
committerMichael Tremer <michael.tremer@ipfire.org>
Mon, 18 Mar 2024 16:56:50 +0000 (16:56 +0000)
The export became a huge problem here as it was very slow. One large
query produced one very large output which was difficult to test and
validate as the total export could easily take 17-20 hrs.

This patch fixes this so that the export now takes around ~5min. Sadly
we are still not using more than one processor core, because PostgreSQL
thinks its faster with just the one, but we trade a lot of CPU time for
a little bit of extra memory instead.

We create a new temporary table that holds all networks that we are
interested in. We then run a couple of broken down queries that perform
one thing instead of multiple at the same time.

First, we add all AS numbers, then all countries which are the longest
queries taking now around 20-30s. Then we apply any data from the
external feeds and our own overrides.

The final export requires about 1.7 GiB of memory because we don't
perform any deduplication in the database any more. That has been a lot
better implemented in our own code and I don't think that < 2 GiB is a
reason for concern.

Signed-off-by: Michael Tremer <michael.tremer@ipfire.org>
src/scripts/location-importer.in

index afd1fd05d5f78b6d4fcc58fc9c9b463f4929cebb..80290ffe953502657614e67a280c5cad05ff3c0a 100644 (file)
@@ -369,296 +369,416 @@ class CLI(object):
                # Add all networks
                log.info("Writing networks...")
 
-               # Select all known networks
-               rows = self.db.query("""
-                       WITH known_networks AS (
-                               SELECT network FROM announcements
-                               UNION
-                               SELECT network FROM networks
-                               UNION
-                               SELECT network FROM network_feeds
-                               UNION
-                               SELECT network FROM network_overrides
-                               UNION
-                               SELECT network FROM geofeed_networks
-                       ),
-
-                       ordered_networks AS (
-                               SELECT
-                                       known_networks.network AS network,
-                                       announcements.autnum AS autnum,
-                                       networks.country AS country,
+               # Create a new temporary table where we collect
+               # the networks that we are interested in
+               self.db.execute("""
+                       CREATE TEMPORARY TABLE
+                               n
+                       (
+                               network               inet NOT NULL,
+                               autnum                integer,
+                               country               text,
+                               is_anonymous_proxy    boolean,
+                               is_satellite_provider boolean,
+                               is_anycast            boolean,
+                               is_drop               boolean
+                       )
+                       WITH (FILLFACTOR = 50)
+               """)
 
-                                       -- Must be part of returned values for ORDER BY clause
-                                       masklen(announcements.network) AS sort_a,
-                                       masklen(networks.network) AS sort_b
-                               FROM
-                                       known_networks
-                               LEFT JOIN
-                                       announcements ON known_networks.network <<= announcements.network
-                               LEFT JOIN
-                                       networks ON known_networks.network <<= networks.network
-                               ORDER BY
-                                       known_networks.network,
-                                       sort_a DESC,
-                                       sort_b DESC
+               # Add all known networks
+               self.db.execute("""
+                       INSERT INTO
+                               n
+                       (
+                               network
                        )
 
-                       -- Return a list of those networks enriched with all
-                       -- other information that we store in the database
                        SELECT
-                               DISTINCT ON (network)
-                               network,
-                               autnum,
+                               network
+                       FROM
+                               announcements
 
-                               -- Country
-                               COALESCE(
-                                       (
-                                               SELECT
-                                                       country
-                                               FROM
-                                                       network_overrides overrides
-                                               WHERE
-                                                       networks.network <<= overrides.network
-                                               ORDER BY
-                                                       masklen(overrides.network) DESC
-                                               LIMIT 1
-                                       ),
-                                       (
-                                               SELECT
-                                                       country
-                                               FROM
-                                                       autnum_overrides overrides
-                                               WHERE
-                                                       networks.autnum = overrides.number
-                                       ),
-                                       (
-                                               SELECT
-                                                       country
-                                               FROM
-                                                       network_feeds feeds
-                                               WHERE
-                                                       networks.network <<= feeds.network
-                                               ORDER BY
-                                                       masklen(feeds.network) DESC
-                                               LIMIT 1
-                                       ),
-                                       (
-                                               SELECT
-                                                       country
-                                               FROM
-                                                       autnum_feeds feeds
-                                               WHERE
-                                                       networks.autnum = feeds.number
-                                               ORDER BY
-                                                       source
-                                               LIMIT 1
-                                       ),
-                                       (
-                                               SELECT
-                                                       geofeed_networks.country AS country
-                                               FROM
-                                                       network_geofeeds
+                       UNION
 
-                                               -- Join the data from the geofeeds
-                                               LEFT JOIN
-                                                       geofeeds ON network_geofeeds.url = geofeeds.url
-                                               LEFT JOIN
-                                                       geofeed_networks ON geofeeds.id = geofeed_networks.geofeed_id
+                       SELECT
+                               network
+                       FROM
+                               networks
 
-                                               -- Check whether we have a geofeed for this network
-                                               WHERE
-                                                       networks.network <<= network_geofeeds.network
-                                               AND
-                                                       networks.network <<= geofeed_networks.network
+                       UNION
 
-                                               -- Filter for the best result
-                                               ORDER BY
-                                                       masklen(geofeed_networks.network) DESC
-                                               LIMIT 1
-                                       ),
-                                       networks.country
-                               ) AS country,
+                       SELECT
+                               network
+                       FROM
+                               network_feeds
 
-                               -- Flags
-                               COALESCE(
-                                       (
-                                               SELECT
-                                                       is_anonymous_proxy
-                                               FROM
-                                                       network_overrides overrides
-                                               WHERE
-                                                       networks.network <<= overrides.network
-                                               ORDER BY
-                                                       masklen(overrides.network) DESC
-                                               LIMIT 1
-                                       ),
-                                       (
-                                               SELECT
-                                                       is_anonymous_proxy
-                                               FROM
-                                                       network_feeds feeds
-                                               WHERE
-                                                       networks.network <<= feeds.network
-                                               ORDER BY
-                                                       masklen(feeds.network) DESC
-                                               LIMIT 1
-                                       ),
-                                       (
-                                               SELECT
-                                                       is_anonymous_proxy
-                                               FROM
-                                                       autnum_feeds feeds
-                                               WHERE
-                                                       networks.autnum = feeds.number
-                                               ORDER BY
-                                                       source
-                                               LIMIT 1
-                                       ),
-                                       (
-                                               SELECT
-                                                       is_anonymous_proxy
-                                               FROM
-                                                       autnum_overrides overrides
-                                               WHERE
-                                                       networks.autnum = overrides.number
-                                       ),
-                                       FALSE
-                               ) AS is_anonymous_proxy,
-                               COALESCE(
-                                       (
-                                               SELECT
-                                                       is_satellite_provider
-                                               FROM
-                                                       network_overrides overrides
-                                               WHERE
-                                                       networks.network <<= overrides.network
-                                               ORDER BY
-                                                       masklen(overrides.network) DESC
-                                               LIMIT 1
-                                       ),
-                                       (
-                                               SELECT
-                                                       is_satellite_provider
-                                               FROM
-                                                       network_feeds feeds
-                                               WHERE
-                                                       networks.network <<= feeds.network
-                                               ORDER BY
-                                                       masklen(feeds.network) DESC
-                                               LIMIT 1
-                                       ),
-                                       (
-                                               SELECT
-                                                       is_satellite_provider
-                                               FROM
-                                                       autnum_feeds feeds
-                                               WHERE
-                                                       networks.autnum = feeds.number
-                                               ORDER BY
-                                                       source
-                                               LIMIT 1
-                                       ),
-                                       (
-                                               SELECT
-                                                       is_satellite_provider
-                                               FROM
-                                                       autnum_overrides overrides
-                                               WHERE
-                                                       networks.autnum = overrides.number
-                                       ),
-                                       FALSE
-                               ) AS is_satellite_provider,
-                               COALESCE(
-                                       (
-                                               SELECT
-                                                       is_anycast
-                                               FROM
-                                                       network_overrides overrides
-                                               WHERE
-                                                       networks.network <<= overrides.network
-                                               ORDER BY
-                                                       masklen(overrides.network) DESC
-                                               LIMIT 1
-                                       ),
+                       UNION
+
+                       SELECT
+                               network
+                       FROM
+                               network_overrides
+
+                       UNION
+
+                       SELECT
+                               network
+                       FROM
+                               geofeed_networks
+               """)
+
+               # Create an index to search through networks faster
+               self.db.execute("""
+                       CREATE INDEX
+                               n_search
+                       ON
+                               n
+                       USING
+                               SPGIST(network)
+               """)
+
+               # Analyze n
+               self.db.execute("ANALYZE n")
+
+               # Apply the AS number to all networks
+               self.db.execute("""
+                       -- Join all networks together with their most specific announcements
+                       WITH announcements AS (
+                               SELECT
+                                       n.network,
+                                       announcements.autnum,
+
+                                       -- Sort all merges and number them so
+                                       -- that we can later select the best one
+                                       ROW_NUMBER()
+                                       OVER
                                        (
-                                               SELECT
-                                                       is_anycast
-                                               FROM
-                                                       network_feeds feeds
-                                               WHERE
-                                                       networks.network <<= feeds.network
+                                               PARTITION BY
+                                                       n.network
                                                ORDER BY
-                                                       masklen(feeds.network) DESC
-                                               LIMIT 1
-                                       ),
+                                                       masklen(announcements.network) DESC
+                                       ) AS row
+                               FROM
+                                       n
+                               JOIN
+                                       announcements
+                               ON
+                                       announcements.network >>= n.network
+                       )
+
+                       -- Store the result
+                       UPDATE
+                               n
+                       SET
+                               autnum = announcements.autnum
+                       FROM
+                               announcements
+                       WHERE
+                               announcements.network = n.network
+                       AND
+                               announcements.row = 1
+                       """,
+               )
+
+               # Apply country information
+               self.db.execute("""
+                       WITH networks AS (
+                               SELECT
+                                       n.network,
+                                       networks.country,
+
+                                       ROW_NUMBER()
+                                       OVER
                                        (
-                                               SELECT
-                                                       is_anycast
-                                               FROM
-                                                       autnum_feeds feeds
-                                               WHERE
-                                                       networks.autnum = feeds.number
+                                               PARTITION BY
+                                                       n.network
                                                ORDER BY
-                                                       source
-                                               LIMIT 1
-                                       ),
-                                       (
-                                               SELECT
-                                                       is_anycast
-                                               FROM
-                                                       autnum_overrides overrides
-                                               WHERE
-                                                       networks.autnum = overrides.number
-                                       ),
-                                       FALSE
-                               ) AS is_anycast,
-                               COALESCE(
+                                                       masklen(networks.network) DESC
+                                       ) AS row
+                               FROM
+                                       n
+                               JOIN
+                                       networks
+                               ON
+                                       networks.network >>= n.network
+                       )
+
+                       UPDATE
+                               n
+                       SET
+                               country = networks.country
+                       FROM
+                               networks
+                       WHERE
+                               networks.network = n.network
+                       AND
+                               networks.row = 1
+                       """,
+               )
+
+               # Add all country information from Geofeeds
+               self.db.execute("""
+                       WITH geofeeds AS (
+                               SELECT
+                                       DISTINCT ON (geofeed_networks.network)
+                                       geofeed_networks.network,
+                                       geofeed_networks.country
+                               FROM
+                                       geofeeds
+                               JOIN
+                                       network_geofeeds networks
+                               ON
+                                       geofeeds.url = networks.url
+                               JOIN
+                                       geofeed_networks
+                               ON
+                                       geofeeds.id = geofeed_networks.geofeed_id
+                               AND
+                                       networks.network >>= geofeed_networks.network
+                       ),
+
+                       networks AS (
+                               SELECT
+                                       n.network,
+                                       geofeeds.country,
+
+                                       ROW_NUMBER()
+                                       OVER
                                        (
-                                               SELECT
-                                                       is_drop
-                                               FROM
-                                                       network_overrides overrides
-                                               WHERE
-                                                       networks.network <<= overrides.network
+                                               PARTITION BY
+                                                       n.network
                                                ORDER BY
-                                                       masklen(overrides.network) DESC
-                                               LIMIT 1
-                                       ),
+                                                       masklen(geofeeds.network) DESC
+                                       ) AS row
+                               FROM
+                                       n
+                               JOIN
+                                       geofeeds
+                               ON
+                                       geofeeds.network >>= n.network
+                       )
+
+                       UPDATE
+                               n
+                       SET
+                               country = networks.country
+                       FROM
+                               networks
+                       WHERE
+                               networks.network = n.network
+                       AND
+                               networks.row = 1
+                       """,
+               )
+
+               # Apply country and flags from feeds
+               self.db.execute("""
+                       WITH networks AS (
+                               SELECT
+                                       n.network,
+                                       network_feeds.country,
+
+                                       -- Flags
+                                       network_feeds.is_anonymous_proxy,
+                                       network_feeds.is_satellite_provider,
+                                       network_feeds.is_anycast,
+                                       network_feeds.is_drop,
+
+                                       ROW_NUMBER()
+                                       OVER
                                        (
-                                               SELECT
-                                                       is_drop
-                                               FROM
-                                                       network_feeds feeds
-                                               WHERE
-                                                       networks.network <<= feeds.network
+                                               PARTITION BY
+                                                       n.network
                                                ORDER BY
-                                                       masklen(feeds.network) DESC
-                                               LIMIT 1
-                                       ),
+                                                       masklen(network_feeds.network) DESC
+                                       ) AS row
+                               FROM
+                                       n
+                               JOIN
+                                       network_feeds
+                               ON
+                                       network_feeds.network >>= n.network
+                       )
+
+                       UPDATE
+                               n
+                       SET
+                               country =
+                                       COALESCE(networks.country, n.country),
+
+                               is_anonymous_proxy =
+                                       COALESCE(networks.is_anonymous_proxy, n.is_anonymous_proxy),
+
+                               is_satellite_provider =
+                                       COALESCE(networks.is_satellite_provider, n.is_satellite_provider),
+
+                               is_anycast =
+                                       COALESCE(networks.is_anycast, n.is_anycast),
+
+                               is_drop =
+                                       COALESCE(networks.is_drop, n.is_drop)
+                       FROM
+                               networks
+                       WHERE
+                               networks.network = n.network
+                       AND
+                               networks.row = 1
+                       """,
+               )
+
+               # Apply country and flags from AS feeds
+               self.db.execute("""
+                       WITH networks AS (
+                               SELECT
+                                       n.network,
+                                       autnum_feeds.country,
+
+                                       -- Flags
+                                       autnum_feeds.is_anonymous_proxy,
+                                       autnum_feeds.is_satellite_provider,
+                                       autnum_feeds.is_anycast,
+                                       autnum_feeds.is_drop
+                               FROM
+                                       n
+                               JOIN
+                                       autnum_feeds
+                               ON
+                                       autnum_feeds.number = n.autnum
+                       )
+
+                       UPDATE
+                               n
+                       SET
+                               country =
+                                       COALESCE(networks.country, n.country),
+
+                               is_anonymous_proxy =
+                                       COALESCE(networks.is_anonymous_proxy, n.is_anonymous_proxy),
+
+                               is_satellite_provider =
+                                       COALESCE(networks.is_satellite_provider, n.is_satellite_provider),
+
+                               is_anycast =
+                                       COALESCE(networks.is_anycast, n.is_anycast),
+
+                               is_drop =
+                                       COALESCE(networks.is_drop, n.is_drop)
+                       FROM
+                               networks
+                       WHERE
+                               networks.network = n.network
+               """)
+
+               # Apply network overrides
+               self.db.execute("""
+                       WITH networks AS (
+                               SELECT
+                                       n.network,
+                                       network_overrides.country,
+
+                                       -- Flags
+                                       network_overrides.is_anonymous_proxy,
+                                       network_overrides.is_satellite_provider,
+                                       network_overrides.is_anycast,
+                                       network_overrides.is_drop,
+
+                                       ROW_NUMBER()
+                                       OVER
                                        (
-                                               SELECT
-                                                       is_drop
-                                               FROM
-                                                       autnum_feeds feeds
-                                               WHERE
-                                                       networks.autnum = feeds.number
+                                               PARTITION BY
+                                                       n.network
                                                ORDER BY
-                                                       source
-                                               LIMIT 1
-                                       ),
-                                       (
-                                               SELECT
-                                                       is_drop
-                                               FROM
-                                                       autnum_overrides overrides
-                                               WHERE
-                                                       networks.autnum = overrides.number
-                                       ),
-                                       FALSE
-                               ) AS is_drop
+                                                       masklen(network_overrides.network) DESC
+                                       ) AS row
+                               FROM
+                                       n
+                               JOIN
+                                       network_overrides
+                               ON
+                                       network_overrides.network >>= n.network
+                       )
+
+                       UPDATE
+                               n
+                       SET
+                               country =
+                                       COALESCE(networks.country, n.country),
+
+                               is_anonymous_proxy =
+                                       COALESCE(networks.is_anonymous_proxy, n.is_anonymous_proxy),
+
+                               is_satellite_provider =
+                                       COALESCE(networks.is_satellite_provider, n.is_satellite_provider),
+
+                               is_anycast =
+                                       COALESCE(networks.is_anycast, n.is_anycast),
+
+                               is_drop =
+                                       COALESCE(networks.is_drop, n.is_drop)
                        FROM
-                               ordered_networks networks
+                               networks
+                       WHERE
+                               networks.network = n.network
+                       AND
+                               networks.row = 1
+               """)
+
+               # Apply AS overrides
+               self.db.execute("""
+                       WITH networks AS (
+                               SELECT
+                                       n.network,
+                                       autnum_overrides.country,
+
+                                       -- Flags
+                                       autnum_overrides.is_anonymous_proxy,
+                                       autnum_overrides.is_satellite_provider,
+                                       autnum_overrides.is_anycast,
+                                       autnum_overrides.is_drop
+                               FROM
+                                       n
+                               JOIN
+                                       autnum_overrides
+                               ON
+                                       autnum_overrides.number = n.autnum
+                       )
+
+                       UPDATE
+                               n
+                       SET
+                               country =
+                                       COALESCE(networks.country, n.country),
+
+                               is_anonymous_proxy =
+                                       COALESCE(networks.is_anonymous_proxy, n.is_anonymous_proxy),
+
+                               is_satellite_provider =
+                                       COALESCE(networks.is_satellite_provider, n.is_satellite_provider),
+
+                               is_anycast =
+                                       COALESCE(networks.is_anycast, n.is_anycast),
+
+                               is_drop =
+                                       COALESCE(networks.is_drop, n.is_drop)
+                       FROM
+                               networks
+                       WHERE
+                               networks.network = n.network
+               """)
+
+               # Here we could remove some networks that we no longer need, but since we
+               # already have implemented our deduplication/merge algorithm this would not
+               # be necessary.
+
+               # Export the entire temporary table
+               rows = self.db.query("""
+                       SELECT
+                               *
+                       FROM
+                               n
+                       ORDER BY
+                               network
                """)
 
                for row in rows:
@@ -687,7 +807,17 @@ class CLI(object):
 
                # Add all countries
                log.info("Writing countries...")
-               rows = self.db.query("SELECT * FROM countries ORDER BY country_code")
+
+               # Select all countries
+               rows = self.db.query("""
+                       SELECT
+                               *
+                       FROM
+                               countries
+                       ORDER BY
+                               country_code
+                       """,
+               )
 
                for row in rows:
                        c = writer.add_country(row.country_code)