From c6d1a9ce6ec85d5ca774cd88da83b5b975ec7910 Mon Sep 17 00:00:00 2001
From: "Casper V. Kristensen"
Date: Sun, 12 May 2019 21:01:15 +0200
Subject: [PATCH] Revert to using timestamp instead of expire in cache - cache
 cleaning cutoff configurable instead.

---
 dailyreleases/cache.py      | 74 +++++++++++++++++--------------------
 dailyreleases/stores/web.py |  3 +-
 2 files changed, 34 insertions(+), 43 deletions(-)

diff --git a/dailyreleases/cache.py b/dailyreleases/cache.py
index 103eb28..a27b11f 100644
--- a/dailyreleases/cache.py
+++ b/dailyreleases/cache.py
@@ -13,6 +13,13 @@ from .config import DATA_DIR, CONFIG
 logger = logging.getLogger(__name__)
 
+connection = sqlite3.connect(DATA_DIR.joinpath("cache.sqlite"))
+connection.row_factory = sqlite3.Row  # allow accessing rows by index and case-insensitively by name
+connection.text_factory = bytes  # do not try to decode bytes as utf-8 strings
+
+DEFAULT_CACHE_TIME = timedelta(seconds=CONFIG["web"].getint("cache_time"))
+logger.info("Default cache time is %s", DEFAULT_CACHE_TIME)
+
 
 class Response:
     def __init__(self, bytes: bytes = None) -> None:
         self.bytes = bytes
@@ -24,33 +31,23 @@ class Response:
         return json.loads(self.bytes)
 
 
-connection = sqlite3.connect(DATA_DIR.joinpath("cache.sqlite"))
-connection.row_factory = sqlite3.Row  # allow accessing rows by index and case-insensitively by name
-connection.text_factory = bytes  # do not try to decode bytes as utf-8 strings
-
-CACHE_TIME = timedelta(seconds=CONFIG["web"].getint("cache_time"))
-logger.info("Default cache time is %s", CACHE_TIME)
-
-connection.execute(
-    """
-    CREATE TABLE IF NOT EXISTS
-    requests (id INTEGER PRIMARY KEY,
-              url TEXT UNIQUE NOT NULL,
-              response BLOB NOT NULL,
-              expire INTEGER NOT NULL);
-    """
-)
+def setup():
+    connection.execute("""
+        CREATE TABLE IF NOT EXISTS
+        requests (id INTEGER PRIMARY KEY,
+                  url TEXT UNIQUE NOT NULL,
+                  response BLOB NOT NULL,
+                  timestamp INTEGER NOT NULL);
+    """)
 
 
-def clean():
-    connection.execute(
-        """
+def clean(older_than=timedelta(days=3)):
+    connection.execute("""
         DELETE FROM requests
-        WHERE expire < :expire;
+        WHERE timestamp < :cutoff;
         """, {
-            "expire": datetime.utcnow().timestamp(),
-        }
-    )
+            "cutoff": (datetime.utcnow() - older_than).timestamp(),
+        })
     connection.execute("VACUUM;")
     connection.commit()
 
@@ -58,15 +55,11 @@
 last_request = defaultdict(float)
 
 
-def get(url: str, params: Mapping = None, cache_time: timedelta = CACHE_TIME,
+def get(url: str, params: Mapping = None, cache_time: timedelta = DEFAULT_CACHE_TIME,
         ratelimit: Optional[float] = 1, *args, **kwargs) -> Response:
     """
     Sends a GET request, caching the result for cache_time. If 'ratelimit' is supplied, requests are rate limited
     at the host-level to this number of requests per second.
-
-    We're saving requests' expire instead of the timestamp it was received to allow for varying cache times; if we were
-    saving the timestamp, clean() wouldn't know when to delete unless the cache time was always the same. This, however,
-    also means that the first call determines for how longer subsequent calls will consider a request fresh.
     """
     if params is not None:
         url += "?" + urllib.parse.urlencode(params)
@@ -75,17 +68,15 @@ def get(url: str, params: Mapping = None, cache_time: timedelta = CACHE_TIME,
 
     #logger.debug("Get %s", url)
 
-    row = connection.execute(
-        """
-        SELECT response, expire
+    row = connection.execute("""
+        SELECT response, timestamp
         FROM requests
         WHERE url = :url;
         """, {
             "url": url
-        }
-    ).fetchone()
+        }).fetchone()
 
-    if row is not None and datetime.fromtimestamp(row["expire"]) > datetime.utcnow():
+    if row is not None and datetime.fromtimestamp(row["timestamp"]) > datetime.utcnow() - cache_time:
         #logger.debug("Cache hit: %s", url)
         return Response(row["response"])
 
@@ -100,15 +91,16 @@
     response = Response(urlopen(request).read())
     last_request[request.host] = time.time()
-    connection.execute(
-        """
-        INSERT OR REPLACE INTO requests(url, response, expire)
-        VALUES (:url, :response, :expire);
+    connection.execute("""
+        INSERT OR REPLACE INTO requests(url, response, timestamp)
+        VALUES (:url, :response, :timestamp);
         """, {
             "url": url,
             "response": response.bytes,
-            "expire": (datetime.utcnow() + cache_time).timestamp()
-        }
-    )
+            "timestamp": datetime.utcnow().timestamp()
+        })
     connection.commit()
 
     return response
+
+
+setup()
diff --git a/dailyreleases/stores/web.py b/dailyreleases/stores/web.py
index 1f88d7c..4e6ae1f 100644
--- a/dailyreleases/stores/web.py
+++ b/dailyreleases/stores/web.py
@@ -11,8 +11,7 @@ logger = logging.getLogger(__name__)
 def web_search(query: str) -> List[str]:
     logger.debug("Searching Google for %s", query)
     try:
-        # disable rate-limiting since we have a proper API-key (unlike the other APIs we are using)
-        r = cache.get("https://www.googleapis.com/customsearch/v1", ratelimit=None, params={
+        r = cache.get("https://www.googleapis.com/customsearch/v1", params={
            "key": CONFIG["google"]["key"],
            "cx": CONFIG["google"]["cx"],
            "q": query
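
Reviewer note (not part of the patch): a minimal usage sketch of the reverted
semantics, assuming the dailyreleases package is importable and configured;
the URL below is hypothetical. Freshness is now decided at read time by
comparing the stored timestamp against each caller's cache_time, and clean()
drops rows older than an explicit cutoff instead of relying on per-row expire
values:

    from datetime import timedelta

    from dailyreleases import cache  # assumes the module layout in this patch

    # Per-call freshness: the same cached row can be fresh for one caller and
    # stale for another, because the comparison happens at read time.
    r1 = cache.get("https://api.example.com/games", cache_time=timedelta(minutes=5))
    r2 = cache.get("https://api.example.com/games", cache_time=timedelta(hours=12))

    # clean() no longer needs to know any request's cache_time: it deletes
    # every row whose timestamp is older than the cutoff (default 3 days).
    cache.clean(older_than=timedelta(days=7))

This reverses the trade-off described in the removed docstring: storing expire
let clean() handle varying cache times but locked each URL's freshness to
whatever the first caller requested, while storing the timestamp makes
freshness per-call and leaves clean() with a single, configurable cutoff.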