From c6d1a9ce6ec85d5ca774cd88da83b5b975ec7910 Mon Sep 17 00:00:00 2001
From: "Casper V. Kristensen"
Date: Sun, 12 May 2019 21:01:15 +0200
Subject: [PATCH] Revert to using timestamp instead of expire in cache - cache
 cleaning cutoff configurable instead.

---
 dailyreleases/cache.py      | 74 +++++++++++++++++--------------------
 dailyreleases/stores/web.py |  3 +-
 2 files changed, 34 insertions(+), 43 deletions(-)

diff --git a/dailyreleases/cache.py b/dailyreleases/cache.py
index 103eb28..a27b11f 100644
--- a/dailyreleases/cache.py
+++ b/dailyreleases/cache.py
@@ -13,6 +13,13 @@ from .config import DATA_DIR, CONFIG
 logger = logging.getLogger(__name__)
 
+connection = sqlite3.connect(DATA_DIR.joinpath("cache.sqlite"))
+connection.row_factory = sqlite3.Row  # allow accessing rows by index and case-insensitively by name
+connection.text_factory = bytes  # do not try to decode bytes as utf-8 strings
+
+DEFAULT_CACHE_TIME = timedelta(seconds=CONFIG["web"].getint("cache_time"))
+logger.info("Default cache time is %s", DEFAULT_CACHE_TIME)
+
 
 class Response:
     def __init__(self, bytes: bytes = None) -> None:
         self.bytes = bytes
@@ -24,33 +31,23 @@ class Response:
         return json.loads(self.bytes)
 
 
-connection = sqlite3.connect(DATA_DIR.joinpath("cache.sqlite"))
-connection.row_factory = sqlite3.Row  # allow accessing rows by index and case-insensitively by name
-connection.text_factory = bytes  # do not try to decode bytes as utf-8 strings
-
-CACHE_TIME = timedelta(seconds=CONFIG["web"].getint("cache_time"))
-logger.info("Default cache time is %s", CACHE_TIME)
-
-connection.execute(
-    """
-    CREATE TABLE IF NOT EXISTS
-    requests (id INTEGER PRIMARY KEY,
-              url TEXT UNIQUE NOT NULL,
-              response BLOB NOT NULL,
-              expire INTEGER NOT NULL);
-    """
-)
+def setup():
+    connection.execute("""
+        CREATE TABLE IF NOT EXISTS
+        requests (id INTEGER PRIMARY KEY,
+                  url TEXT UNIQUE NOT NULL,
+                  response BLOB NOT NULL,
+                  timestamp INTEGER NOT NULL);
+    """)
 
 
-def clean():
-    connection.execute(
-        """
+def clean(older_than=timedelta(days=3)):
+    connection.execute("""
         DELETE FROM requests
-        WHERE expire < :expire;
+        WHERE timestamp < :cutoff;
         """, {
-            "expire": datetime.utcnow().timestamp(),
-        }
-    )
+            "cutoff": (datetime.utcnow() - older_than).timestamp(),
+        })
     connection.execute("VACUUM;")
     connection.commit()
 
@@ -58,15 +55,11 @@
 last_request = defaultdict(float)
 
 
-def get(url: str, params: Mapping = None, cache_time: timedelta = CACHE_TIME,
+def get(url: str, params: Mapping = None, cache_time: timedelta = DEFAULT_CACHE_TIME,
         ratelimit: Optional[float] = 1, *args, **kwargs) -> Response:
     """
     Sends a GET request, caching the result for cache_time. If 'ratelimit' is supplied, requests are rate limited
     at the host-level to this number of requests per second.
-
-    We're saving requests' expire instead of the timestamp it was received to allow for varying cache times; if we were
-    saving the timestamp, clean() wouldn't know when to delete unless the cache time was always the same. This, however,
-    also means that the first call determines for how longer subsequent calls will consider a request fresh.
     """
     if params is not None:
         url += "?" + urllib.parse.urlencode(params)
@@ -75,17 +68,15 @@ def get(url: str, params: Mapping = None, cache_time: timedelta = CACHE_TIME,
 
     #logger.debug("Get %s", url)
 
-    row = connection.execute(
-        """
-        SELECT response, expire
+    row = connection.execute("""
+        SELECT response, timestamp
         FROM requests
         WHERE url = :url;
         """, {
             "url": url
-        }
-    ).fetchone()
+        }).fetchone()
 
-    if row is not None and datetime.fromtimestamp(row["expire"]) > datetime.utcnow():
+    if row is not None and datetime.fromtimestamp(row["timestamp"]) > datetime.utcnow() - cache_time:
         #logger.debug("Cache hit: %s", url)
         return Response(row["response"])
 
@@ -100,15 +91,16 @@
     response = Response(urlopen(request).read())
     last_request[request.host] = time.time()
-    connection.execute(
-        """
-        INSERT OR REPLACE INTO requests(url, response, expire)
-        VALUES (:url, :response, :expire);
+    connection.execute("""
+        INSERT OR REPLACE INTO requests(url, response, timestamp)
+        VALUES (:url, :response, :timestamp);
         """, {
             "url": url,
             "response": response.bytes,
-            "expire": (datetime.utcnow() + cache_time).timestamp()
-        }
-    )
+            "timestamp": datetime.utcnow().timestamp()
+        })
     connection.commit()
 
     return response
+
+
+setup()
diff --git a/dailyreleases/stores/web.py b/dailyreleases/stores/web.py
index 1f88d7c..4e6ae1f 100644
--- a/dailyreleases/stores/web.py
+++ b/dailyreleases/stores/web.py
@@ -11,8 +11,7 @@ logger = logging.getLogger(__name__)
 def web_search(query: str) -> List[str]:
     logger.debug("Searching Google for %s", query)
     try:
-        # disable rate-limiting since we have a proper API-key (unlike the other APIs we are using)
-        r = cache.get("https://www.googleapis.com/customsearch/v1", ratelimit=None, params={
+        r = cache.get("https://www.googleapis.com/customsearch/v1", params={
            "key": CONFIG["google"]["key"],
            "cx": CONFIG["google"]["cx"],
            "q": query
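
Reviewer note (not part of the patch): a minimal usage sketch of the reverted
semantics, assuming the dailyreleases package is importable and configured;
the URL below is hypothetical. Freshness is now decided at read time by
comparing the stored timestamp against each caller's cache_time, and clean()
drops rows older than an explicit cutoff instead of relying on per-row expire
values:

    from datetime import timedelta

    from dailyreleases import cache  # assumes the module layout in this patch

    # Per-call freshness: the same cached row can be fresh for one caller and
    # stale for another, because the comparison happens at read time.
    r1 = cache.get("https://api.example.com/games", cache_time=timedelta(minutes=5))
    r2 = cache.get("https://api.example.com/games", cache_time=timedelta(hours=12))

    # clean() no longer needs to know any request's cache_time: it deletes
    # every row whose timestamp is older than the cutoff (default 3 days).
    cache.clean(older_than=timedelta(days=7))

This reverses the trade-off described in the removed docstring: storing expire
let clean() handle varying cache times but locked each URL's freshness to
whatever the first caller requested, while storing the timestamp makes
freshness per-call and leaves clean() with a single, configurable cutoff.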