#!/usr/bin/env python3
"""
Job Hunt Automation Bot
=======================
Scrapes job listings from multiple remote job boards, filters by keywords,
deduplicates results, and outputs formatted listings ready for Telegram.

Sources:
  - RemoteOK  (JSON API — most reliable)
  - OnlineJobs.ph (HTML scraping)
  - Freelancer (HTML scraping)

Usage:
  python job_hunt_bot.py              # print formatted results
  python job_hunt_bot.py --json       # dump raw JSON
  python job_hunt_bot.py --telegram   # Telegram-friendly output (default)
"""

import contextlib
import hashlib
import json
import os
import re
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# ──────────────────────────────────────────────────────────────────────
# CONFIGURATION — edit this section to customise the bot
# ──────────────────────────────────────────────────────────────────────

CONFIG = {
    # Search terms for the HTML scrapers. Only the first three entries are
    # used: OnlineJobs.ph and Freelancer each run one search per term.
    # The RemoteOK scraper reads a single API feed and does not use this list.
    "keywords": [
        "Python",
        "Automation",
        "AI",
        "Developer",
        "Remote",
        "Machine Learning",
    ],

    # Relevance filter — terms compared case-insensitively against each
    # listing's title + description. A listing is kept when the number of
    # matching terms reaches "min_relevance_score" below.
    "relevance_keywords": [
        "python", "automation", "ai", "developer", "remote",
        "machine learning", "software", "backend", "full stack",
        "freelance", "scripting", "bot", "data", "engineer",
        "devops", "django", "flask", "fastapi", "api",
    ],

    # Minimum relevance keyword matches (in title+description) to keep a listing
    "min_relevance_score": 1,

    # Max results returned by each board's scraper
    "max_results_per_board": 15,

    # Request settings (defaults applied by safe_request and the scrapers)
    "request_timeout": 20,        # seconds; default timeout for each HTTP GET
    "request_delay": 1.5,         # seconds slept after each search request (HTML scrapers)
    # Browser-like User-Agent sent unless the caller supplies its own header
    "user_agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/125.0.0.0 Safari/537.36"
    ),

    # File names, resolved against the directory returned by get_script_dir()
    "seen_jobs_file": "seen_jobs.json",   # persisted hashes of already-reported jobs
    "output_file": "job_results.json",    # latest filtered results (overwritten each run)
}

# ──────────────────────────────────────────────────────────────────────
# HELPERS
# ──────────────────────────────────────────────────────────────────────

def get_script_dir() -> Path:
    """Return the directory used for the bot's data files.

    Normally this is the directory containing this script. When running as a
    frozen executable (``sys.frozen`` is set) or when ``__file__`` is not
    defined, the current working directory is returned instead.
    """
    # Frozen bundles may point __file__ at a temporary unpack location, so
    # data files are kept next to wherever the program was launched from.
    if getattr(sys, "frozen", False):
        return Path.cwd()
    try:
        return Path(__file__).resolve().parent
    except NameError:
        # __file__ is absent in some embedded / interactive contexts.
        return Path.cwd()


def load_seen_jobs(path: Path) -> set:
    """Load the set of previously-seen job hashes from disk.

    Args:
        path: Location of the JSON file holding a list of hash strings.

    Returns:
        The stored hashes as a set. An empty set is returned when the file
        does not exist, cannot be read, is not valid UTF-8 JSON, or does not
        contain a JSON list. Non-string entries in the list are ignored.
    """
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        # First run: nothing has been seen yet.
        return set()
    except (OSError, ValueError) as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        print(f"  [!] Could not read seen-jobs file {path}: {e}", file=sys.stderr)
        return set()

    if not isinstance(data, list):
        # set("abc") or set({...}) would silently yield characters / keys.
        print(f"  [!] Ignoring seen-jobs file {path}: expected a JSON list", file=sys.stderr)
        return set()

    # Keep only strings so the set stays hashable and sortable when saved.
    return {entry for entry in data if isinstance(entry, str)}


def save_seen_jobs(path: Path, seen: set) -> None:
    """Persist the seen-jobs set to disk as a sorted JSON list.

    The data is written to a sibling ``<name>.tmp`` file first and then moved
    over *path* with ``os.replace``, so an interrupted run cannot leave a
    half-written store behind.

    Args:
        path: Destination JSON file.
        seen: Hashes to store; they must be mutually sortable.

    Raises:
        OSError: If the file cannot be written or replaced.
        TypeError: If the hashes cannot be sorted or serialised.
    """
    path = Path(path)
    tmp_path = path.with_name(path.name + ".tmp")
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(sorted(seen), f, indent=2)
        os.replace(tmp_path, path)
    finally:
        # After a successful replace the temp file is already gone; on failure
        # this removes the partial file.
        tmp_path.unlink(missing_ok=True)


def job_hash(job: dict) -> str:
    """Return the MD5 hex digest that identifies a listing for deduplication.

    The digest covers the lower-cased, stripped title and the stripped URL,
    joined by ``|``. Missing keys are treated as empty strings.
    """
    title_part = job.get("title", "").lower().strip()
    url_part = job.get("url", "").strip()
    fingerprint = "|".join((title_part, url_part))
    digest = hashlib.md5(fingerprint.encode())
    return digest.hexdigest()


def _keyword_in_text(keyword: str, text: str) -> bool:
    """Return True if lower-cased *keyword* occurs in *text* at the start of a word."""
    escaped = re.escape(keyword)
    if len(keyword) <= 3:
        # Short keywords ("ai", "api", "bot") must be whole words (optionally
        # plural); otherwise they match inside "email", "rapid", "both", ...
        pattern = rf"(?<!\w){escaped}s?(?!\w)"
    else:
        # Longer keywords may be followed by a suffix ("engineer" -> "engineering").
        pattern = rf"(?<!\w){escaped}"
    return re.search(pattern, text) is not None


def relevance_score(job: dict, keywords: list[str]) -> int:
    """Count how many relevance keywords appear in the title or description.

    Matching is case-insensitive and anchored to word starts, so a keyword
    embedded in the middle of an unrelated word does not count. Blank
    keywords are ignored.

    Args:
        job: Listing dict; the "title" and "description" keys are read and
            default to empty strings when missing.
        keywords: Terms to look for.

    Returns:
        The number of keywords that match at least once.
    """
    text = f"{job.get('title', '')} {job.get('description', '')}".lower()
    normalised = (kw.strip().lower() for kw in keywords)
    return sum(1 for kw in normalised if kw and _keyword_in_text(kw, text))


def safe_request(url: str, **kwargs) -> Optional[requests.Response]:
    """Make an HTTP GET with default headers and error handling.

    Args:
        url: Address to fetch.
        **kwargs: Extra arguments forwarded to ``requests.get``. ``headers``
            and ``timeout`` are given defaults from CONFIG when omitted; a
            caller-supplied ``User-Agent`` header is kept.

    Returns:
        The response on success, or ``None`` when the request fails or the
        server answers with an error status (the failure is logged to stderr).
    """
    # Work on a copy so the caller's dict is never mutated; `or {}` also
    # tolerates an explicit headers=None.
    headers = dict(kwargs.pop("headers", None) or {})
    headers.setdefault("User-Agent", CONFIG["user_agent"])
    timeout = kwargs.pop("timeout", CONFIG["request_timeout"])

    try:
        resp = requests.get(url, headers=headers, timeout=timeout, **kwargs)
        resp.raise_for_status()
        return resp
    except requests.RequestException as e:
        print(f"  [!] Request failed for {url}: {e}", file=sys.stderr)
        return None


def clean_text(text: str) -> str:
    """Collapse every run of whitespace to one space and trim both ends.

    Falsy input (empty string or None) yields an empty string.
    """
    if not text:
        return ""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()


def format_telegram(jobs: list[dict]) -> str:
    """Render job dicts as a Markdown-style message for Telegram.

    Each job becomes a numbered entry with title, company, location, link and
    source; salary and posting date lines appear only when those fields are
    non-empty. The message ends with a local-time "Scanned at" stamp. An
    empty *jobs* list produces a fixed "no new jobs" notice instead.
    """
    if not jobs:
        return "🔍 *No new matching jobs found right now.*\n\nTry again later or adjust keywords."

    out = [f"🎯 *Job Hunt Results* — {len(jobs)} new listing(s)\n"]

    for index, entry in enumerate(jobs, start=1):
        pay = entry.get("salary", "")
        when = entry.get("posted", "")

        block = [
            f"*{index}. {entry.get('title', 'Untitled')}*",
            f"   🏢 {entry.get('company', 'Unknown')}",
            f"   📍 {entry.get('location', '—')}",
        ]
        # Optional fields are skipped entirely when blank.
        if pay:
            block.append(f"   💰 {pay}")
        if when:
            block.append(f"   📅 {when}")
        block.append(f"   🔗 [View listing]({entry.get('url', '')})")
        block.append(f"   📌 _via {entry.get('source', '')}_")
        block.append("")  # blank separator line between entries
        out.extend(block)

    stamp = datetime.now().strftime("%Y-%m-%d %H:%M")
    out.append(f"_Scanned at {stamp}_")
    return "\n".join(out)


# ──────────────────────────────────────────────────────────────────────
# SCRAPERS — one function per job board
# ──────────────────────────────────────────────────────────────────────

def _format_remoteok_salary(salary_min, salary_max) -> str:
    """Format a salary range as "$lo – $hi / yr"; return "" if neither bound is usable."""

    def _bound(value) -> str:
        # The API value may be missing, zero, or non-numeric; all render as "?".
        try:
            amount = float(value)
        except (TypeError, ValueError, OverflowError):
            return "?"
        return f"${amount:,.0f}" if amount else "?"

    lo, hi = _bound(salary_min), _bound(salary_max)
    if lo == "?" and hi == "?":
        return ""
    return f"{lo} – {hi} / yr"


def scrape_remoteok() -> list[dict]:
    """
    Scrape RemoteOK using their public JSON API.
    Endpoint: https://remoteok.com/api

    Returns a list of normalised job dicts (at most
    CONFIG["max_results_per_board"]) with the keys title, company, location,
    salary, url, source, posted and description. The listing's tags are
    appended to the description so they take part in relevance scoring.
    An empty list is returned when the request fails or the payload is not
    a JSON list.
    """
    print("[*] Scraping RemoteOK...")
    jobs: list[dict] = []

    resp = safe_request("https://remoteok.com/api")
    if not resp:
        return jobs

    try:
        data = resp.json()
    except ValueError:
        # requests raises a ValueError subclass whose exact type depends on
        # the installed JSON backend, so catch the common base.
        print("  [!] Failed to decode RemoteOK JSON", file=sys.stderr)
        return jobs

    if not isinstance(data, list):
        print("  [!] Unexpected RemoteOK payload (expected a JSON list)", file=sys.stderr)
        return jobs

    # First element is metadata, skip it
    entries = data[1:] if len(data) > 1 else data

    max_n = CONFIG["max_results_per_board"]
    # Scan up to twice the cap so entries lacking a title or URL do not
    # leave the result short.
    for item in entries[:max_n * 2]:
        if len(jobs) >= max_n:
            break
        if not isinstance(item, dict):
            continue

        title = clean_text(item.get("position", ""))
        url = item.get("url", "")
        if not title or not url or not isinstance(url, str):
            continue
        if not url.startswith("http"):
            url = "https://remoteok.com" + url

        company = clean_text(item.get("company", ""))
        location = clean_text(item.get("location", ""))
        description = clean_text(item.get("description", ""))

        # Tags are folded into the description for keyword matching
        tags = item.get("tags", [])
        if isinstance(tags, list):
            description += " " + " ".join(str(t) for t in tags)

        # Keep only the leading YYYY-MM-DD part of the date string
        posted = item.get("date", "")
        posted = posted[:10] if isinstance(posted, str) else ""

        jobs.append({
            "title": title,
            "company": company or "RemoteOK Listing",
            "location": location or "Remote",
            "salary": _format_remoteok_salary(
                item.get("salary_min"), item.get("salary_max")
            ),
            "url": url,
            "source": "RemoteOK",
            "posted": posted,
            "description": description,
        })

    print(f"  [+] RemoteOK: {len(jobs)} raw listings fetched")
    return jobs


def scrape_onlinejobs_ph() -> list[dict]:
    """
    Scrape OnlineJobs.ph job search results.

    Runs one search per term (the first three entries of CONFIG["keywords"])
    and collects listings until CONFIG["max_results_per_board"] unique URLs
    have been gathered. Duplicates are dropped while collecting, so they do
    not use up the cap.

    Returns a list of normalised job dicts with the keys title, company,
    location, salary, url, source, posted and description.
    """
    print("[*] Scraping OnlineJobs.ph...")
    base_url = "https://www.onlinejobs.ph"
    max_n = CONFIG["max_results_per_board"]
    jobs: list[dict] = []
    seen_urls: set[str] = set()

    # Use first few keywords as search terms
    search_terms = CONFIG["keywords"][:3]

    for term in search_terms:
        if len(jobs) >= max_n:
            break

        resp = safe_request(
            f"{base_url}/jobseekers/jobsearch",
            params={"jobkeyword": term, "page": 1},
        )
        if not resp:
            time.sleep(CONFIG["request_delay"])
            continue

        soup = BeautifulSoup(resp.text, "html.parser")

        # The page markup is not guaranteed, so fall back through
        # progressively looser selectors until one yields cards.
        listings = (
            soup.select("div.job-listing")
            or soup.select("div.card-body")
            or soup.select("article")
            or soup.select("div[class*='job']")
        )

        for card in listings:
            if len(jobs) >= max_n:
                break

            # Title link — also provides the listing URL
            title_el = (
                card.select_one("h2 a")
                or card.select_one("h3 a")
                or card.select_one("a.job-title")
                or card.select_one("a[href*='/job/']")
            )
            if not title_el:
                continue
            title = clean_text(title_el.get_text())
            href = (title_el.get("href") or "").strip()
            if not title or not href:
                continue

            # urljoin handles absolute, root-relative and relative hrefs;
            # anything that does not resolve to http(s) is not a listing link.
            job_url = urljoin(base_url + "/", href)
            if not job_url.startswith(("http://", "https://")):
                continue
            if job_url in seen_urls:
                continue
            seen_urls.add(job_url)

            # Extract company
            company_el = (
                card.select_one("span.company-name")
                or card.select_one("a.company")
                or card.select_one("p.company")
                or card.select_one("div[class*='company']")
            )
            company = clean_text(company_el.get_text()) if company_el else "OnlineJobs.ph Employer"

            # Extract salary
            salary_el = (
                card.select_one("span.salary")
                or card.select_one("div.salary")
                or card.select_one("p[class*='salary']")
                or card.select_one("span[class*='salary']")
            )
            salary = clean_text(salary_el.get_text()) if salary_el else ""

            # Extract location
            loc_el = (
                card.select_one("span.location")
                or card.select_one("div.location")
                or card.select_one("span[class*='location']")
            )
            location = clean_text(loc_el.get_text()) if loc_el else "Philippines (Remote)"

            jobs.append({
                "title": title,
                "company": company,
                "location": location,
                "salary": salary,
                "url": job_url,
                "source": "OnlineJobs.ph",
                "posted": "",
                # No description is scraped; title + search term stand in
                # for it during relevance scoring.
                "description": f"{title} {term}",
            })

        time.sleep(CONFIG["request_delay"])

    print(f"  [+] OnlineJobs.ph: {len(jobs)} raw listings fetched")
    return jobs


def scrape_freelancer() -> list[dict]:
    """
    Scrape Freelancer.com job listings.

    Fetches the public jobs page for each search term (the first three
    entries of CONFIG["keywords"], lower-cased with spaces turned into
    hyphens) and collects listings until CONFIG["max_results_per_board"]
    unique URLs have been gathered. Duplicates are dropped while collecting,
    so they do not use up the cap.

    Returns a list of normalised job dicts with the keys title, company,
    location, salary, url, source, posted and description.
    """
    print("[*] Scraping Freelancer.com...")
    base_url = "https://www.freelancer.com"
    max_n = CONFIG["max_results_per_board"]
    jobs: list[dict] = []
    seen_urls: set[str] = set()

    # Freelancer search URLs for a few keywords
    search_terms = CONFIG["keywords"][:3]

    for term in search_terms:
        if len(jobs) >= max_n:
            break

        slug = term.lower().replace(" ", "-")
        resp = safe_request(f"{base_url}/jobs/{slug}/")
        if not resp:
            time.sleep(CONFIG["request_delay"])
            continue

        soup = BeautifulSoup(resp.text, "html.parser")

        # Freelancer uses various card layouts — try several selectors,
        # ending with bare project links as a last resort.
        listings = (
            soup.select("div.JobSearchCard-item")
            or soup.select("div[class*='JobSearchCard']")
            or soup.select("div.JobCard")
            or soup.select("div[class*='job-card']")
            or soup.select("a[href*='/projects/']")
        )

        for card in listings:
            if len(jobs) >= max_n:
                break

            # Title link — when the card itself is an <a>, use it directly
            title_el = (
                card.select_one("a.JobSearchCard-primary-heading-link")
                or card.select_one("a[class*='heading']")
                or card.select_one("h2 a")
                or card.select_one("h3 a")
                or (card if card.name == "a" else card.select_one("a"))
            )
            if not title_el:
                continue
            title = clean_text(title_el.get_text())
            href = (title_el.get("href") or "").strip()
            if not title or not href:
                continue

            # urljoin handles absolute, root-relative and relative hrefs;
            # anything that does not resolve to http(s) is not a listing link.
            job_url = urljoin(base_url + "/", href)
            if not job_url.startswith(("http://", "https://")):
                continue
            if job_url in seen_urls:
                continue
            seen_urls.add(job_url)

            # Company / poster
            company_el = (
                card.select_one("a.JobSearchCard-primary-heading")
                or card.select_one("span[class*='employer']")
                or card.select_one("a[class*='employer']")
            )
            company = clean_text(company_el.get_text()) if company_el else "Freelancer Client"

            # Budget / salary
            budget_el = (
                card.select_one("div.JobSearchCard-secondary-price")
                or card.select_one("span[class*='price']")
                or card.select_one("div[class*='budget']")
            )
            salary = clean_text(budget_el.get_text()) if budget_el else ""

            # Location
            loc_el = (
                card.select_one("span.JobSearchCard-secondary-location")
                or card.select_one("span[class*='location']")
            )
            location = clean_text(loc_el.get_text()) if loc_el else "Remote"

            jobs.append({
                "title": title,
                "company": company,
                "location": location,
                "salary": salary,
                "url": job_url,
                "source": "Freelancer",
                "posted": "",
                # No description is scraped; title + search term stand in
                # for it during relevance scoring.
                "description": f"{title} {term}",
            })

        time.sleep(CONFIG["request_delay"])

    print(f"  [+] Freelancer: {len(jobs)} raw listings fetched")
    return jobs


# ──────────────────────────────────────────────────────────────────────
# MAIN PIPELINE
# ──────────────────────────────────────────────────────────────────────

def run_scraper() -> list[dict]:
    """
    Full pipeline: scrape → deduplicate → filter by relevance → save.

    Every new listing (relevant or not) is recorded in the seen-jobs store,
    so it is not reported again on later runs. The relevant listings are
    written to the output file without their internal "description" field.

    Returns the list of new, filtered jobs (descriptions included).
    """
    script_dir = get_script_dir()
    seen_path = script_dir / CONFIG["seen_jobs_file"]
    output_path = script_dir / CONFIG["output_file"]

    # 1. Load previously seen jobs
    seen_hashes = load_seen_jobs(seen_path)
    print(f"[*] Loaded {len(seen_hashes)} previously-seen job hashes")

    # 2. Scrape all boards
    all_jobs: list[dict] = []
    scrapers = [
        ("RemoteOK", scrape_remoteok),
        ("OnlineJobs.ph", scrape_onlinejobs_ph),
        ("Freelancer", scrape_freelancer),
    ]

    for name, scraper_fn in scrapers:
        # Broad catch on purpose: one broken board must not stop the others.
        try:
            all_jobs.extend(scraper_fn())
        except Exception as e:
            print(f"  [!] Error scraping {name}: {e}", file=sys.stderr)

    print(f"\n[*] Total raw listings: {len(all_jobs)}")

    # 3. Deduplicate against previously seen jobs AND within this run
    new_jobs = []
    new_hashes = set()
    for job in all_jobs:
        h = job_hash(job)
        if h in seen_hashes or h in new_hashes:
            continue
        new_hashes.add(h)
        new_jobs.append(job)

    print(f"[*] New listings (not seen before): {len(new_jobs)}")

    # 4. Filter by relevance
    keywords = CONFIG["relevance_keywords"]
    min_score = CONFIG["min_relevance_score"]
    relevant = [
        j for j in new_jobs
        if relevance_score(j, keywords) >= min_score
    ]
    print(f"[*] Relevant listings after filtering: {len(relevant)}")

    # 5. Save full results to JSON (for debugging / auditing)
    # Strip the internal description field before saving
    output_jobs = [
        {k: v for k, v in j.items() if k != "description"}
        for j in relevant
    ]
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(output_jobs, f, indent=2, ensure_ascii=False)
    print(f"[*] Results saved to {output_path}")

    # 6. Update the seen-jobs store last: if writing the results failed
    # above, these jobs are still treated as new on the next run.
    seen_hashes.update(new_hashes)
    save_seen_jobs(seen_path, seen_hashes)

    return relevant


def main():
    """
    Entry point. Run directly or import and call run_scraper().

    With ``--json`` on the command line, stdout carries only the JSON array
    of new jobs; the banner, progress messages and summary go to stderr so
    the output can be piped into other tools. Without it, everything is
    printed to stdout and the jobs are shown in Telegram format.

    Returns the list of new, relevant jobs.
    """
    # Check for CLI flags
    output_json = "--json" in sys.argv

    # Decorations and progress share one stream: stderr in JSON mode (to keep
    # stdout parseable), stdout otherwise.
    log_stream = sys.stderr if output_json else sys.stdout

    print("=" * 60, file=log_stream)
    print("  🤖  Job Hunt Automation Bot", file=log_stream)
    print(f"  ⏰  {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", file=log_stream)
    print("=" * 60, file=log_stream)

    # The pipeline prints progress via plain print(); redirect it as well.
    with contextlib.redirect_stdout(log_stream):
        jobs = run_scraper()

    print("\n" + "=" * 60, file=log_stream)

    if output_json:
        # Raw JSON dump (internal description field removed)
        clean = [{k: v for k, v in j.items() if k != "description"} for j in jobs]
        print(json.dumps(clean, indent=2, ensure_ascii=False))
    else:
        # Telegram-formatted output
        print(format_telegram(jobs))

    print("=" * 60, file=log_stream)
    print(f"\n✅ Done. {len(jobs)} job(s) to review.", file=log_stream)
    return jobs


if __name__ == "__main__":
    main()