#!/usr/bin/env python
# Reads an OONI http_requests report and shows URLs that have known block pages.
#
# First, make an OONI report:
#   ooniprobe -i /usr/share/ooni/decks/complete_no_root.deck
# Then,
#   ./findblocks report-http_requests-XXXX.yamloo

import getopt
import sys

import yaml
from bs4 import BeautifulSoup

# <title> text of the CloudFlare block page recognised by classify_blockpage.
CLOUDFLARE_BLOCK_TITLE = u"Attention Required! | CloudFlare"


def _warn(message):
    """Write one diagnostic line to stderr."""
    sys.stderr.write("%s\n" % message)


def classify_blockpage(response):
    """Classify an HTTP response as a block page or not.

    response is a mapping with an integer "code" and, optionally, an HTML
    "body".

    Returns an (is_block, description) tuple.  A 403 whose page title is the
    CloudFlare block title is described as "403-CLOUDFLARE"; any other 4xx or
    5xx status is a block described as "<code>-OTHER"; everything else is not
    a block and is described by its bare status code.
    """
    code = response["code"]
    if code == 403:
        # The title is only needed to recognise the CloudFlare page, so the
        # body is parsed for 403 responses only.  A missing or null body is
        # treated as empty.  The parser is named explicitly so the result does
        # not depend on which optional parsers happen to be installed.
        soup = BeautifulSoup(response.get("body") or "", "html.parser")
        title = soup.title
        if title is not None and title.get_text() == CLOUDFLARE_BLOCK_TITLE:
            return True, "403-CLOUDFLARE"
    if code // 100 in (4, 5):
        return True, "%d-OTHER" % code
    return False, "%d" % code


def split_requests(requests):
    """Split a measurement's requests into a (nontor, tor) pair.

    Requests whose "failure" field is set are ignored.  Among the remaining
    requests there must be exactly one with is_tor false and exactly one with
    is_tor true.

    Raises ValueError if either kind is missing or appears more than once.
    """
    nontor = None
    tor = None
    for request in requests:
        if request.get("failure") is not None:
            continue
        if not request["request"]["tor"]["is_tor"]:
            if nontor is not None:
                raise ValueError("more than one is_tor:false request")
            nontor = request
        else:
            if tor is not None:
                raise ValueError("more than one is_tor:true request")
            tor = request
    if nontor is None:
        raise ValueError("no is_tor:false request")
    if tor is None:
        raise ValueError("no is_tor:true request")
    return nontor, tor


def process_file(f):
    """Scan one YAMLOO report stream and print inputs blocked on one side only.

    For every measurement whose non-Tor and Tor responses disagree about
    being blocked, a tab-separated line "nontor_class, tor_class, input" is
    written to stdout.  Measurements that cannot be compared are reported on
    stderr and skipped.

    Raises ValueError if the stream has no header document or the header
    lacks the "input_hashes" key.
    """
    yamloo = yaml.safe_load_all(f)
    # First YAML doc in YAMLOO file is a header with a different format.
    header = next(yamloo, None)
    # Sanity check: make sure a header key is in there.  This is an explicit
    # raise rather than an assert so that it still runs under "python -O".
    if not isinstance(header, dict) or "input_hashes" not in header:
        raise ValueError("missing or malformed report header (no input_hashes)")

    for doc in yamloo:
        if doc["control_failure"] is not None:
            _warn("%s: control_failure=%s" % (doc["input"], doc["control_failure"]))
            continue
        if doc["experiment_failure"] is not None:
            _warn("%s: experiment_failure=%s" % (doc["input"], doc["experiment_failure"]))
            continue
        try:
            nontor, tor = split_requests(doc["requests"])
        except ValueError as e:
            _warn("%s: %s" % (doc["input"], str(e)))
            continue

        nontor_isblocked, nontor_class = classify_blockpage(nontor["response"])
        tor_isblocked, tor_class = classify_blockpage(tor["response"])
        if nontor_isblocked != tor_isblocked:
            sys.stdout.write("%s\t%s\t%s\n" % (nontor_class, tor_class, doc["input"]))
            # Flush per result so output appears promptly when piped.
            sys.stdout.flush()


def process_filename(filename):
    """Open the report at filename and process it with process_file."""
    # Binary mode lets the YAML loader do the decoding itself instead of
    # relying on the locale's default text encoding.
    with open(filename, "rb") as f:
        return process_file(f)


def main(argv=None):
    """Process every report file named on the command line."""
    if argv is None:
        argv = sys.argv[1:]
    # No options are defined; gnu_getopt is kept so that unknown options are
    # still rejected and "--" still ends option parsing.
    _opts, args = getopt.gnu_getopt(argv, "")
    for filename in args:
        process_filename(filename)


if __name__ == "__main__":
    main()