From: Ralph Ronnquist Date: Mon, 17 Feb 2025 23:37:52 +0000 (+1100) Subject: Initial implementation X-Git-Tag: 0.1~6 X-Git-Url: https://git.rrq.au/?a=commitdiff_plain;h=bbb34fe014bde47614800110bb92beb8d1110b81;p=rrq%2Fblockdomains.git Initial implementation --- bbb34fe014bde47614800110bb92beb8d1110b81 diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..930d43c --- /dev/null +++ b/Makefile @@ -0,0 +1,52 @@ +# Distribution Makefile for blockdomains + +# Generated files +SBINFILES = blockdomains +BINFILES = blockdomainsctl +MANFILES = blockdomains.5 blockdomains.8 blockdomainsctl.8 +CFGDIRS = blocked/ acl/ + +default: $(SBINFILES) $(MANFILES) + +.PHONY: install clean + +# Building targets + +$(MANFILES): %: doc/%.adoc + asciidoctor -bmanpage -o $@ $< + +README.html: README.adoc + asciidoctor -bhtml $< + +blockdomains: src/blockdomains.c src/cache.c src/database.c + gcc -g -Wall -o $@ $^ -lnetfilter_queue + +# Installation + +SBINDIR = $(DESTDIR)/usr/sbin +BINDIR = $(DESTDIR)/usr/bin +CFGTOP = $(DESTDIR)/etc/blockdomains +MAN5DIR = $(DESTDIR)/usr/share/man/man5 +MAN8DIR = $(DESTDIR)/usr/share/man/man8 +SYSVINIT = $(DESTDIR)/etc/init.d/blockdomains + +install: $(addprefix $(SBINDIR)/,$(SBINFILES)) +install: $(addprefix $(BINDIR)/,$(BINFILES)) +install: $(addprefix $(MAN5DIR)/,$(filter %.5,$(MANFILES))) +install: $(addprefix $(MAN8DIR)/,$(filter %.8,$(MANFILES))) +install: $(addprefix $(CFGTOP)/,$(CFGDIRS)) +install: $(SYSVINIT) + +$(SYSVINIT): init/blockdomains + install -D $< $@ + +$(CFGTOP)/%/: + mkdir -p $@ + +$(SBINDIR)/% $(BINDIR)/% $(MAN5DIR)/% $(MAN8DIR)/%: % + install -D $< $@ + +# Cleaning up + +clean: + rm -f $(SBINFILES) $(MANFILES) diff --git a/README.adoc b/README.adoc new file mode 100644 index 0000000..4b0a02e --- /dev/null +++ b/README.adoc @@ -0,0 +1,72 @@ += Blocklist based domain name filtering + +The `blockdomains` utility is a blacklist based network traffic filter +for `iptables` via `libnetfilter-queue`. 
It applies to HTTP and SSL +traffic for recognizing and dropping packets that are directed to +blacklisted domain names. + +== Dependencies + +Operationally `blockdomains` depends on the `libnetfilter-queue-dev` and +`iptables` packages, and for building, you'll also need a C build +environment including `make`. + +The blacklist format is that of squidblacklist.org, which you'll need +to acquire separately. + +== Build and Install + +`blockdomains` is distributed in a tar file, which should be unpacked at +its future residence; e.g., as /usr/local/src/blockdomains-1.0.0. Then +`cd` into that directory and type: + +> `# make` + +This will build the binary filter, and install the control script as +`/usr/local/sbin/blockdomains.sh`. Edit the Makefile to install +elsewhere. + +== Setup and Configuration + +The utility has a configuration directory `acl` that is intended to +hold all available access control lists, and a directory `blocked` +that should be set up with links to the access control list files +to use. For example: + +> `# ( cd blocked && ln -s ../acl/youtube-google-videos.acl )` + +That command will set up `youtube-google-videos.acl` to be an included +blacklist. Do the opposite to remove; for example: + +> `# rm blocked/youtube-google-videos.acl` + +== Running + +The `blockdomains` filter is started with the following command: + +> `# blockdomains.sh start` + +With the `start` argument, the script adds appropriate `iptables` +rules to direct certain traffic to net-filter queue 99, and it +starts a background process for that filtering. + +> `# blockdomains.sh reload` + +With the `reload` argument, the control script stops and restarts the +filter without changing `iptables` rules. + +> `# blockdomains.sh stop` + +With the `stop` argument, the control script removes the `iptables` +rules and terminates the filtering process. + +== Technical Detail + +The filtering uses the given lists of domain names for rejecting +packets. 
It recognizes HTTP message headers and SSL certificate +requests, from where it picks out the targeted domain name. If that +name is blacklisted or in a blacklisted domain, then the packet is +rejected. + +The filtering also uses a fixed size decision cache, so that +subsequent decisions for the same target can be made quickly. diff --git a/blockdomainsctl b/blockdomainsctl new file mode 100755 index 0000000..c91ad8a --- /dev/null +++ b/blockdomainsctl @@ -0,0 +1,20 @@ +#!/bin/sh +# +# Control script for manual use of blockdomains. + +do_start() { # send outbound TCP to NFQUEUE 99 and run the filter in the background + iptables -I OUTPUT -p tcp -j NFQUEUE --queue-num 99 + blockdomains /etc/blockdomains/blocked/*.acl & +} + +do_stop() { # remove the NFQUEUE rule and terminate the filter + iptables -D OUTPUT -p tcp -j NFQUEUE --queue-num 99 + pkill -x blockdomains # -x: exact name only; the plain pattern also matches this script (blockdomainsctl) and would kill it before a reload restarts the filter +} + +case "$1" in + start) do_start ;; + reload) do_stop ; do_start ;; + stop) do_stop ;; + *) echo "Use start, stop or reload" >&2 ; exit 1 ;; +esac diff --git a/doc/blockdomains.5.adoc b/doc/blockdomains.5.adoc new file mode 100644 index 0000000..9b3c72e --- /dev/null +++ b/doc/blockdomains.5.adoc @@ -0,0 +1,44 @@ += blockdomains(5) + +== NAME +blockdomains - block list file format + +== SYNOPSIS + +/etc/blockdomains/acl/blocklist.acl + +ln -s ../acl/blocklist.acl /etc/blockdomains/blocked/ + +== DESCRIPTION + +**blockdomains** uses one or more block list files which contain +declarations of the domains to block, one domain per line that starts +with any number of whitespace characters followed by a period (".") +before the domain to block. The blocking applies to the domain and all +its sub domains. + +==== +Anything not starting with a period (".") is a comment and leading +whitespace is ignored. Block list domains start with optional +whitespace and a period, followed by the domain name to block, +optionally followed by a whitespace and a comment. Like the following: +==== + +.Example of block list +---- + .bad.domain.com -- domain name up to whitespace is blocked + +Blank lines are fine too; they are treated as comments. 
The block list +domains don't need to be column aligned. So, here is another: + +.another.domain.to.block +End of block list example. +---- + +== SEE ALSO + +blockdomains(8) + +== AUTHOR + +Ralph Ronnquist diff --git a/doc/blockdomains.8.adoc b/doc/blockdomains.8.adoc new file mode 100644 index 0000000..408546a --- /dev/null +++ b/doc/blockdomains.8.adoc @@ -0,0 +1,25 @@ += blockdomains(8) + +== NAME + +blockdomains - Firewall agent blocking selected HTTP and HTTPS +connections + +== SYNOPSIS + +blockdomains __blocklistfile__+ + +== DESCRIPTION + +The **blockdomains** utility is a blacklist based network traffic +filter for iptables via libnetfilter-queue. It applies to HTTP and SSL +traffic for recognizing and dropping packets that are directed to +blacklisted domain names. + +== SEE ALSO + +blockdomains(5) + +== AUTHOR + +Ralph Ronnquist diff --git a/doc/blockdomainsctl.8.adoc b/doc/blockdomainsctl.8.adoc new file mode 100644 index 0000000..7810b95 --- /dev/null +++ b/doc/blockdomainsctl.8.adoc @@ -0,0 +1,21 @@ += blockdomainsctl(8) + +== NAME + +blockdomainsctl - utility for manual start/reload/stop of blockdomains + +== SYNOPSIS + +blockdomainsctl __action__ + +== DESCRIPTION + +blockdomainsctl is a utility for manual operation of blockdomains. 
+ +== SEE ALSO + +blockdomains(8) + +== AUTHOR + +Ralph Ronnquist diff --git a/doc/ssl.txt b/doc/ssl.txt new file mode 100644 index 0000000..9dbe988 --- /dev/null +++ b/doc/ssl.txt @@ -0,0 +1,42 @@ +const unsigned char good_data_2[] = { + // TLS record + 0x16, // [0] Content Type: Handshake + 0x03, 0x01, // [1,2] Version: TLS 1.0 + 0x00, 0x6c, // [3,4] Length (use for bounds checking) + // Handshake + 0x01, // [5] Handshake Type: Client Hello + 0x00, 0x00, 0x68, // [6,7,8] Length (use for bounds checking) + 0x03, 0x03, // [9,10] Version: TLS 1.2 + // [11..42] Random (32 bytes fixed length) + 0xb6, 0xb2, 0x6a, 0xfb, 0x55, 0x5e, 0x03, 0xd5, + 0x65, 0xa3, 0x6a, 0xf0, 0x5e, 0xa5, 0x43, 0x02, + 0x93, 0xb9, 0x59, 0xa7, 0x54, 0xc3, 0xdd, 0x78, + 0x57, 0x58, 0x34, 0xc5, 0x82, 0xfd, 0x53, 0xd1, + 0x00, // [43] Session ID Length (skip past this much) + 0x00, 0x04, // [44,45] Cipher Suites Length (skip past this much) + 0x00, 0x01, // NULL-MD5 + 0x00, 0xff, // RENEGOTIATION INFO SCSV + 0x01, // Compression Methods Length (skip past this much) + 0x00, // NULL + 0x00, 0x3b, // Extensions Length (use for bounds checking) + // Extension + 0x00, 0x00, // Extension Type: Server Name (check extension type) + 0x00, 0x0e, // Length (use for bounds checking) + 0x00, 0x0c, // Server Name Indication Length + 0x00, // Server Name Type: host_name (check server name type) + 0x00, 0x09, // Length (length of your data) + // "localhost" (data you're after) + 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x68, 0x6f, 0x73, 0x74, + // Extension + 0x00, 0x0d, // Extension Type: Signature Algorithms (check extension type) + 0x00, 0x20, // Length (skip past since this is the wrong extension) + // Data + 0x00, 0x1e, 0x06, 0x01, 0x06, 0x02, 0x06, 0x03, + 0x05, 0x01, 0x05, 0x02, 0x05, 0x03, 0x04, 0x01, + 0x04, 0x02, 0x04, 0x03, 0x03, 0x01, 0x03, 0x02, + 0x03, 0x03, 0x02, 0x01, 0x02, 0x02, 0x02, 0x03, + // Extension + 0x00, 0x0f, // Extension Type: Heart Beat (check extension type) + 0x00, 0x01, // Length (skip past 
since this is the wrong extension) + 0x01 // Mode: Peer allows to send requests +}; diff --git a/init/blockdomains b/init/blockdomains new file mode 100755 index 0000000..82c81ef --- /dev/null +++ b/init/blockdomains @@ -0,0 +1,24 @@ +#!/lib/init/init-d-script +### BEGIN INIT INFO +# Provides: blockdomains +# Required-Start: mountkernfs $local_fs +# X-Start-Before: $network +# Required-Stop: mountkernfs $local_fs +# Default-Start: S +# Default-Stop: 0 6 +# Short-Description: Block selected HTTP and HTTPS connections +# Description: Firewall agent that blocks outbound connections +# for selected domains +### END INIT INFO +DAEMON=/usr/sbin/blockdomains +DAEMON_ARGS="$(ls /etc/blockdomains/blocked/*.acl 2>/dev/null)" +START_ARGS="-b -O /var/log/blockdomains.log" +PIDFILE=none + +do_start_prepare() { + iptables -I OUTPUT -p tcp -j NFQUEUE --queue-num 99 +} + +do_stop_prepare() { + iptables -D OUTPUT -p tcp -j NFQUEUE --queue-num 99 +} diff --git a/src/blockdomains.c b/src/blockdomains.c new file mode 100644 index 0000000..0d551c5 --- /dev/null +++ b/src/blockdomains.c @@ -0,0 +1,340 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for NF_ACCEPT */ + +#include + +// Caching of verdicts +unsigned int lookup_cache(unsigned char *domain); +void add_cache(unsigned char *domain,unsigned int ix); +int hash_code(unsigned char *domain); + +// BAD domains database +unsigned int check_domain(unsigned char *domain); +void load_domains(char *file); +void start_domain_database_loading(void); +void end_domain_database_loading(void); + +/** + * Return packet id, or 0 on error. + */ +static u_int32_t get_packet_id(struct nfq_data *tb) { + struct nfqnl_msg_packet_hdr *ph = nfq_get_msg_packet_hdr( tb ); + return ( ph )? 
ntohl( ph->packet_id ) : 0; +} + +// Overlay for an IPv4 packet: the IP header directly followed by the TCP header. +// NOTE(review): IPv4 options, if present, would shift the TCP header; the overlay does not account for that -- confirm. +struct ipv4_pkt { + struct ip first; // .ip_dst[4 bytes] + struct tcphdr second; +}; + +// Overlay for an IPv6 packet: the fixed IPv6 header directly followed by the TCP header. +struct ipv6_pkt { + struct ip6_hdr first; // .ip6_dst[16 bytes] + struct tcphdr second; +}; + +// Payload packet: either overlay; the IP version is read via p.pkt4.first.ip_v for both kinds +struct packet { + union { + struct ipv4_pkt pkt4; + struct ipv6_pkt pkt6; + } p; + //unsigned char pad[12]; // ?? +}; + +// View the raw packet bytes as a struct packet (a plain cast; nothing is copied or validated) +static struct packet *get_headerP(unsigned char *data) { + return (struct packet *) data; +} + +///////// Debugging +//const char *inet_ntop(int af, const void *restrict src, +// char dst[restrict .size], socklen_t size); + +// Format the destination address of the packet as text. The result is held in a +// static buffer, so it is overwritten by the next call. For an IP version other +// than 4 or 6 the text is the version number followed by " ???". +static const char *tell_ip(struct packet *ip) { + static char THEIP[200]; + switch ( ip->p.pkt4.first.ip_v ) { + case 4: + return inet_ntop( AF_INET, &ip->p.pkt4.first.ip_dst, THEIP, 200 ); + case 6: + return inet_ntop( AF_INET6, &ip->p.pkt6.first.ip6_dst, THEIP, 200 ); + } + snprintf( THEIP, 200, "%d ???", ip->p.pkt4.first.ip_v ); + return THEIP; +} + +/** + * Debugging aid: print the destination address, the overlay size (the + * variable named syn), the destination port and the packet length to + * stderr, followed by a dump of at most END (400) bytes from the + * start of the packet. + */ +static void view_payload(unsigned char *data,int length) { + struct packet *header = get_headerP( data ); + u_int16_t port = 0; + u_int8_t syn = 0; + unsigned char *body = data ;//+ sizeof( struct packet ); + switch ( header->p.pkt4.first.ip_v ) { + case 4: + port = ntohs( ((struct ipv4_pkt *) data )->second.th_dport ); + syn = sizeof( struct ipv4_pkt ); + break; + case 6: + port = ntohs( ((struct ipv6_pkt *) data )->second.th_dport ); + syn = sizeof( struct ipv6_pkt ); + break; + } +#define END 400 + unsigned char * end = body + ( ( length > END )? END : length ); + fprintf( stderr, "%s %d %d %d ", tell_ip( header ), syn, port, length ); + while ( body < end ) { + unsigned char c = *body++; + // The trailing "|| 1" makes this condition always true, so every byte is printed as hex + if ( c < ' ' || c >= 127 || 1 ) { + fprintf( stderr, "%02x ", c ); + } else { + fprintf( stderr, "%c", c ); + } + } + fprintf( stderr, "\n" ); +} + +////////////////// +// Shared result buffer: ssl_host() and http_host() return the extracted host name in here +static unsigned char buffer[1000]; + +/** + * SSL traffic includes a data packet with a clear text host name. 
+ * This is knwon as the SNI extension. + */ +static unsigned char *ssl_host(unsigned char *data,int length) { + // Check that it's a "Client Hello" message + unsigned char *p; + switch ( ((struct packet *) data)->p.pkt4.first.ip_v ) { + case 4: + p = data + sizeof( struct ipv4_pkt ) + 12; //?? + break; + case 6: + p = data + sizeof( struct ipv6_pkt ) + 0; //?? + break; + default: + return 0; + } + if ( p[0] != 0x16 || p[1] != 0x03 || p[5] != 0x01 || p[6] != 0x00 ) { + return 0; + } + fprintf( stderr, "Client Hello\n" ); + // Note minor version p[2] is not checked + // record_length = 256 * p[3] + p[4] + // handshake_message_length = 256 * p[7] + p[8] + if ( p[9] != 0x03 || p[10] != 0x03 ) { // TLS 1.2 (?ralph?) + return 0; + } + fprintf( stderr, "TLS 1.2\n" ); + unsigned int i = 46 + ( 256 * p[44] ) + p[45]; + i += p[i] + 1; + unsigned int extensions_length = ( 256 * p[i] ) + p[i+1]; + i += 2; + int k = 0; + fprintf( stderr, "TLS 1.2 %d %d\n", i, extensions_length ); + while ( k < extensions_length ) { + unsigned int type = ( 256 * p[i+k] ) + p[i+k+1]; + k += 2; + unsigned int length = ( 256 * p[i+k] ) + p[i+k+1]; + k += 2; + fprintf( stderr, "Extension %d %d\n", k-4, type ); + if ( type == 0 ) { // Server Name + if ( p[i+k+2] ) { + break; // Name badness + } + unsigned int name_length = ( 256 * p[i+k+3] ) + p[i+k+4]; + unsigned char *path = &p[i+k+5]; + memcpy( buffer, path, name_length ); + buffer[ name_length ] = '\0'; + return buffer; + } + k += length; + } + // This point is only reached on "missing or bad SNI". + view_payload( data, length ); + return 0; +} + +/** + * HTTP traffic includes a data packet with the host name as a + * "Host:" attribute. 
+ *
+ * data/length is the whole IP packet. Returns the host name, NUL
+ * terminated and held in the shared static buffer (so overwritten by
+ * the next extraction), or 0 when the packet is not the start of a
+ * GET or POST request or has no usable "Host:" line.
+ *
+ * The request text is located through the IPv4 header length and the
+ * TCP data offset, so TCP options do not hide it. A ":port" suffix is
+ * not part of the returned name, and names that do not fit the buffer
+ * are refused. IPv6 extension headers are not handled.
+ */
+static unsigned char *http_host(unsigned char *data,int length) {
+    unsigned int size = ( length > 0 )? (unsigned int) length : 0;
+    unsigned int off; // Offset of the TCP header within the packet
+    unsigned int doff; // TCP header length in bytes
+    if ( size < sizeof( struct ip ) ) {
+        return 0;
+    }
+    switch ( data[0] >> 4 ) { // IP version
+    case 4:
+        off = 4 * ( data[0] & 0x0f ); // IPv4 header length is in 32-bit words
+        if ( off < sizeof( struct ip ) ) {
+            return 0;
+        }
+        break;
+    case 6:
+        off = sizeof( struct ip6_hdr );
+        break;
+    default:
+        return 0;
+    }
+    if ( off + sizeof( struct tcphdr ) > size ) {
+        return 0;
+    }
+    doff = 4 * ( data[ off + 12 ] >> 4 ); // TCP data offset is in 32-bit words
+    if ( doff < sizeof( struct tcphdr ) ) {
+        return 0;
+    }
+    off += doff;
+    // The comparisons below look at up to 5 bytes at a time, so the
+    // scan stops 6 bytes short of the packet end.
+    if ( off + 6 > size ) {
+        return 0;
+    }
+    unsigned char *body = data + off;
+    unsigned char *end = data + size - 6;
+    if ( ( strncmp( (char*) body, "GET ", 4 ) != 0 ) &&
+         ( strncmp( (char*) body, "POST ", 5 ) != 0 ) ) {
+        return 0;
+    }
+    int check = 0; // True when body is at the start of a header line
+    for ( ; body < end; body++ ) {
+        if ( check ) {
+            if ( strncmp( (char*) body, "Host:", 5 ) == 0 ) {
+                body += 5;
+                for( ; body < end; body++ ) if ( *body != ' ' ) break;
+                unsigned char *start = body;
+                int n = 0;
+                // The name ends at whitespace, or at the ':' of a port
+                for( ; body < end; n++, body++ ) {
+                    if ( *body <= ' ' || *body == ':' ) break;
+                }
+                if ( n < 5 || n >= (int) sizeof( buffer ) ) {
+                    return 0;
+                }
+                memcpy( buffer, start, n );
+                buffer[ n ] = '\0';
+                return buffer;
+            }
+            if ( strncmp( (char*) body, "\r\n", 2 ) == 0 ) {
+                return 0; // Blank line: end of the headers
+            }
+            for( ; body < end; body++ ) if ( *body == '\n' ) break;
+            if ( body >= end ) {
+                return 0;
+            }
+        }
+        check = ( *body == '\n' );
+    }
+    return 0;
+}
+
+/**
+ * Callback function to handle a packet. 
+ */ +static int cb( + struct nfq_q_handle *qh, + struct nfgenmsg *nfmsg, + struct nfq_data *nfa, void *code ) +{ + u_int32_t id = get_packet_id( nfa ); + unsigned char *data; + int length = nfq_get_payload( nfa, &data); + int verdict = NF_ACCEPT; + struct packet *header = get_headerP( data ); +#if 0 + fprintf( stderr, "PKT %s %d\n", tell_ip( header ), length ); +#endif + if ( length >= 100 ) { + unsigned char *host = http_host( data, length ); +#if 1 + if ( host ) { + fprintf( stderr, "HTTP HOST %s %s\n", tell_ip( header ), host ); + } +#endif + if ( host == 0 ) { + host = ssl_host( data, length ); +#if 1 + if ( host ) { + fprintf( stderr, "SSL HOST %s %s\n", tell_ip( header ), host ); + } +#endif + } + if ( host ) { + int i = lookup_cache( host ); + if ( i < 0 ) { + unsigned int ix = check_domain( host ); + add_cache( host, ix ); +#if 1 + fprintf( stderr, "%s %d %s ** %d\n", + tell_ip( header ), hash_code( host ), host, ix ); +#endif + if ( ix > 0 ) { + verdict = NF_DROP; + } + } else if ( i > 0 ) { + verdict = NF_DROP; + } + } + } + return nfq_set_verdict(qh, id, verdict, 0, NULL); +} + +/** + * Program main function. 
+ */ +int main(int argc, char **argv) { + // Load the database: every command line argument names one block list file + start_domain_database_loading(); + int n = 1; + for ( ; n < argc; n++ ) { + fprintf( stderr, "Loading blacklist %s\n", argv[ n ] ); + load_domains( argv[ n ] ); + } + end_domain_database_loading(); + + struct nfq_handle *h; + struct nfq_q_handle *qh; + //struct nfnl_handle *nh; + int fd; + int rv; + char buf[4096] __attribute__ ((aligned)); + + // Any failure while setting up the queue below ends the program with exit code 1 + fprintf( stderr, "opening library handle\n"); + h = nfq_open(); + if ( !h ) { + fprintf(stderr, "error during nfq_open()\n"); + exit(1); + } + + fprintf( stderr, "unbinding any existing nf_queue handler\n" ); + if ( nfq_unbind_pf(h, AF_INET) < 0 ) { + fprintf(stderr, "error during nfq_unbind_pf()\n"); + exit(1); + } + + fprintf( stderr, "binding nfnetlink_queue as nf_queue handler\n" ); + if ( nfq_bind_pf(h, AF_INET) < 0 ) { + fprintf(stderr, "error during nfq_bind_pf()\n"); + exit(1); + } + + // THEQUEUE is the same number as the --queue-num of the iptables rules in the control and init scripts +#define THEQUEUE 99 + fprintf( stderr, "binding this socket to queue '%d'\n", THEQUEUE ); + qh = nfq_create_queue( h, THEQUEUE, &cb, NULL ); + if ( !qh ) { + fprintf(stderr, "error during nfq_create_queue()\n"); + exit(1); + } + + // NOTE(review): the copy range is 0xffff while buf holds 4096 bytes -- confirm that larger packets are handled as intended + fprintf( stderr, "setting copy_packet mode\n" ); + if ( nfq_set_mode(qh, NFQNL_COPY_PACKET, 0xffff ) < 0) { + fprintf(stderr, "can't set packet_copy mode\n"); + exit(1); + } + + fd = nfq_fd( h ); + + // Hand every received message to nfq_handle_packet(); cb was registered as the queue callback above. + // The loop ends when recv() returns 0 or a negative value. + while ( ( rv = recv(fd, buf, sizeof(buf), 0) ) && rv >= 0 ) { + //printf( "pkt received\n" ); + nfq_handle_packet(h, buf, rv); + } + + fprintf( stderr, "unbinding from queue %d\n", THEQUEUE); + nfq_destroy_queue(qh); + +#ifdef INSANE + /* normally, applications SHOULD NOT issue this command, since it + detaches other programs/sockets from AF_INET, too ! 
*/ + fprintf( stderr, "unbinding from AF_INET\n"); + nfq_unbind_pf(h, AF_INET); +#endif + + fprintf( stderr, "closing library handle\n"); + nfq_close( h ); + + exit( 0 ); +} diff --git a/src/cache.c b/src/cache.c new file mode 100644 index 0000000..2afb37a --- /dev/null +++ b/src/cache.c @@ -0,0 +1,45 @@ +#include +#include +#include + +typedef struct _CacheEntry { + unsigned char *domain; + unsigned int ix; +} CacheEntry; + +struct { + CacheEntry *table; + int size; +} cache; + +int hash_code(unsigned char *domain) { + int i = 0; + for ( ; *domain; domain++ ) { + i += *domain; + } + return i % cache.size; +} + +int lookup_cache(unsigned char *domain) { + if ( cache.table ) { + int i = hash_code( domain ); + if ( cache.table[i].domain && + strcmp( (char*) domain, (char*) cache.table[i].domain ) == 0 ) { + return cache.table[i].ix; + } + } + return -1; +} + +void add_cache(unsigned char *domain,unsigned int ix) { + if ( cache.table == 0 ) { + cache.size = 1024; + cache.table = (CacheEntry*) calloc( cache.size, sizeof( CacheEntry ) ); + } + int i = hash_code( domain ); + if ( cache.table[i].domain ) { + free( cache.table[i].domain ); + } + cache.table[i].domain = (unsigned char*) strdup( (char*) domain ); + cache.table[i].ix = ix; +} diff --git a/src/database.c b/src/database.c new file mode 100644 index 0000000..02bb8dc --- /dev/null +++ b/src/database.c @@ -0,0 +1,280 @@ +#include +#include +#include +#include +#include +#include +#include + +/** + * This file implements a "database" of "bad" domains, loaded from + * ".acl" files of a fairly strict format; each domain to block is + * written on a line starting with a period, immediately followed by + * the domain to block, then an optional comment. + * + * The database is populated by using the call sequence: + * 1. start_domain_database_loading(); + * 2. load_domains( filename ); // repeated + * N. 
end_domain_database_loading(); + * + * The final call triggers a reordering of domains so as to support + * binary search in reverse text order, for matching domain suffixes. + * See the function `tail_compare` for details. + */ + +/** + * This is the Entry type for the "database", which basically is an + * array of these. The domain pointer will point at a domain name in + * the loaded ".acl" file, and length is the domain name length. + */ +typedef struct _Entry { + int length; + unsigned char *domain; +} Entry; + +/** + * This is the domain name database root structure. It holds a pointer + * to the array of Entry records, the fill of that array, and the + * allocated size for that array (no lesser than the fill, of course). + */ +static struct { + Entry *table; + int fill; + int size; +} database = { 0, 0, 0 }; + +/** + * This function compares strings backwars; the last k bytes of string + * (a,na) versus string (b,nb). It also holds '.' as the least of + * characters, so as to ensure that refined/extended domain names are + * comparatively greater that their base domain names. + */ +static int tail_compare(unsigned char *a,unsigned char *b,int k) { + while ( k-- > 0 ) { + int c = *(--a) - *(--b); + if ( c != 0) { + if ( *a == '.' ) { + return -1; + } + if ( *b == '.' ) { + return 1; + } + return c; + } + } + return 0; +} + +/** + * Extend the domain name table to allow additions. + */ +#define STARTSIZE 100000 +static void grow() { + if ( database.table ) { + Entry *old = database.table; + int s = database.size; + database.size += 100000; + database.table = (Entry*) calloc( database.size, sizeof( Entry ) ); + memcpy( database.table, old, s * sizeof( Entry ) ); + free( old ); + } else { + database.table = (Entry*) calloc( STARTSIZE, sizeof( Entry ) ); + database.size = STARTSIZE; + } +} + +/** + * Determine the index for given domain. 
This computes a tail + * match between the given domain and the database domains, returning + * the index for the matching database entry, or (-index-1) to + * indicate insertion point. In lookup mode, a database entry being a + * tail domain part of the given domain is also considered a match. + * + * The domain is given as a pointer and its length n; only those n + * bytes are looked at. The search is a binary search over the first + * database.fill table entries, so these must be in table_order() + * order, which is the order index_domain() itself assumes. + * + * NOTE(review): in lookup mode a shorter entry can be missed when the + * table also holds longer entries with that same tail; e.g., with + * both "a.com" and "b.a.com" in the table, "x.a.com" compares after + * "b.a.com" and the search never visits "a.com". + */ +static int index_domain(unsigned char *domain,int n,int lookup) { + int lo = 0; + int hi = database.fill; + while ( lo < hi ) { + int m = ( lo + hi ) / 2; + Entry *p = &database.table[ m ]; + // k is the length of the shorter of the two names + int k = p->length; + if ( n < k ) { + k = n; + } + int q = tail_compare( p->domain + p->length, domain + n, k ); +#if 0 + fprintf( stderr, "%s %d %d %d\n", domain, k, m, q ); +#endif + if ( q == 0 ) { + if ( p->length < n ) { + // table entry shorter => new entry after, or match on lookup + // (a match only when the entry starts right after a '.' in domain) + if ( lookup && *(domain+n-k-1) == '.' ) { + return m; + } + lo = m + 1; + } else if ( p->length > n ) { + // table entry longer => new entry before + hi = m; + } else { + // equal + return m; + } + } else if ( q < 0 ) { + // new entry after + lo = m + 1; + } else { + // new entry before + hi = m; + } + } + return -lo - 1; +} + +/** + * Determine the length of a "word", which ends before the first byte + * that is a space or any lesser byte value, NUL included. + */ +static int wordlen(unsigned char *p) { + unsigned char *q = p; + while ( *q > ' ' ) { + q++; + } + return q - p; +} + +// Disabled alternative to fast_add_domain() that keeps the table sorted while adding +#if 0 +static void add_domain(char *domain) { + if ( database.fill >= database.size ) { + grow(); + } + int length = wordlen( domain ); + int i = index_domain( domain, length, 0 ); + if ( i < 0 ) { + i = -i-1; + int tail = database.fill - i; + if ( tail ) { + memmove( &database.table[ i+1 ], + &database.table[i], + tail * sizeof( Entry ) ); + } + database.table[ i ].domain = domain; + database.table[ i ].length = length; + database.fill++; + } else { + char *p1 = strndup( domain, length ); + char *p2 = strndup( database.table[i].domain, + database.table[i].length ); + fprintf( stderr, "fill = %d %d %s == %s\n", + i, database.fill, p1, p2 ); + free( p1 ); + free( p2 ); + } +} +#endif + +static void 
fast_add_domain(unsigned char *domain,int length) { + int fill = database.fill; + if ( fill >= database.size ) { + grow(); + } + database.table[ fill ].length = length; + database.table[ fill ].domain = domain; + database.fill++; +} + +static int table_order(Entry *a,Entry *b) { + int k = ( a->length < b->length )? a->length : b->length; + int c = tail_compare( a->domain + a->length, + b->domain + b->length, k ); + if ( c != 0 ) { + return c; + } + return a->length - b->length; +} + +/** + * External call to check a given domain. + */ +unsigned int check_domain(unsigned char *domain) { + int i = index_domain( domain, wordlen( domain ), 1 ); + return ( i < 0 )? 0 : ( i + 1 ); +} + +void start_domain_database_loading(void) { +} + +#if 0 +static void dump_table() { + fprintf( stderr, "Table fill=%d size=%d\n", database.fill, database.size ); + int i = 0; + for ( ; i < database.fill; i++ ) { + char *p = strndup( database.table[i].domain, + database.table[i].length ); + fprintf( stderr, "[%d] %d %p %s\n", + i, database.table[i].length, database.table[i].domain, p ); + free( p ); + } +} +#endif + +void end_domain_database_loading(void) { + qsort( database.table, database.fill, sizeof( Entry ), + (__compar_fn_t) table_order ); + //dump_table(); +} + +/** + * Load BAD domain names from file. The file is line based where data + * lines consist of domain name starting with period and ending with + * space or newline, and other lines ignored. 
+ * Blanks and tabs before the period are skipped, which is the format
+ * that blockdomains(5) documents. A period without a name is ignored.
+ *
+ * The whole file is read into memory and the database entries point
+ * into that memory, which therefore is never freed. Any error ends
+ * the program with exit code 1 after a message on stderr.
+ */
+void load_domains(char *file) {
+    struct stat info;
+    if ( stat( file, &info ) ) {
+        perror( file );
+        exit( 1 );
+    }
+    size_t n = info.st_size;
+    if ( n == 0 ) {
+        return; // Nothing to load from an empty file
+    }
+    unsigned char *data = (unsigned char *) malloc( n );
+    if ( data == 0 ) {
+        perror( file );
+        exit( 1 );
+    }
+    int fd = open( file, O_RDONLY );
+    if ( fd < 0 ) {
+        perror( file );
+        exit( 1 );
+    }
+    unsigned char *end = data;
+    while ( n > 0 ) {
+        ssize_t k = read( fd, end, n );
+        if ( k < 0 ) {
+            perror( file );
+            exit( 1 );
+        }
+        if ( k == 0 ) {
+            fprintf( stderr, "Premature EOF for %s\n", file );
+            exit( 1 );
+        }
+        end += k;
+        n -= k;
+    }
+    close( fd );
+    unsigned char *p = data;
+    while( p < end ) {
+        // At the start of a line: skip the optional leading whitespace
+        while ( p < end && ( *p == ' ' || *p == '\t' ) ) {
+            p++;
+        }
+        if ( p < end && *p == '.' ) {
+            unsigned char *domain = ++p;
+            // The file need not end with a newline, so stop at end too
+            while ( p < end && *p > ' ' ) {
+                p++;
+            }
+            if ( p > domain ) {
+                fast_add_domain( domain, p - domain );
+            }
+        }
+        // Anything else on the line is comment
+        while ( p < end && *p != '\n' ) {
+            p++;
+        }
+        p++;
+    }
+}