blob: 41f62179e91088a0799b02e8881e052c2aaa0c5a [file]
# Copyright 2026 The Pigweed Authors
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.
"""Downloads and formats Bluetooth core specification documents."""
import os
import re
import sys
from typing import Optional
from urllib.request import urlopen, Request
from urllib.error import URLError
from urllib.parse import urlparse
from bs4 import BeautifulSoup, Tag
# Root project directory for specification cache.
# Resolves to "<parent of this script's directory>/specifications", so the
# cache lives next to (not inside) the directory containing this script.
SPEC_DIR = os.path.join(
os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
"specifications",
)
# Hardcoded list of official Bluetooth Core Specification URLs
# (the published HTML rendering of Core-62 on bluetooth.com).
CORE_SPEC_URLS = [
# Host Controller Interface (HCI) functional specification.
(
"https://www.bluetooth.com/wp-content/uploads/Files/Specification/"
"HTML/Core-62/out/en/host-controller-interface/"
"host-controller-interface-functional-specification.html"
),
# Host volume: L2CAP.
(
"https://www.bluetooth.com/wp-content/uploads/Files/Specification/"
"HTML/Core-62/out/en/host/"
"logical-link-control-and-adaptation-protocol-specification.html"
),
# Host volume: Attribute Protocol (ATT).
(
"https://www.bluetooth.com/wp-content/uploads/Files/Specification/"
"HTML/Core-62/out/en/host/attribute-protocol--att-.html"
),
# Host volume: Generic Access Profile (GAP).
(
"https://www.bluetooth.com/wp-content/uploads/Files/Specification/"
"HTML/Core-62/out/en/host/generic-access-profile.html"
),
# Host volume: Generic Attribute Profile (GATT).
(
"https://www.bluetooth.com/wp-content/uploads/Files/Specification/"
"HTML/Core-62/out/en/host/generic-attribute-profile--gatt-.html"
),
# Host volume: Security Manager (SM).
(
"https://www.bluetooth.com/wp-content/uploads/Files/Specification/"
"HTML/Core-62/out/en/host/security-manager-specification.html"
),
# BR/EDR controller volume: Link Manager Protocol (LMP).
(
"https://www.bluetooth.com/wp-content/uploads/Files/Specification/"
"HTML/Core-62/out/en/br-edr-controller/"
"link-manager-protocol-specification.html"
),
# Low Energy controller volume: Link Layer (LL).
(
"https://www.bluetooth.com/wp-content/uploads/Files/Specification/"
"HTML/Core-62/out/en/low-energy-controller/"
"link-layer-specification.html"
),
]
def fetch_html(url: str) -> Optional[str]:
    """Downloads HTML content from the given URL.

    Args:
        url: Fully-qualified URL of the page to fetch.

    Returns:
        The decoded page body, or None if the download or decode fails.
    """
    print(f"Downloading: {url}...")
    # Some servers reject requests that lack a browser-like User-Agent.
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        req = Request(url, headers=headers)
        with urlopen(req, timeout=30) as response:
            # Honor the charset advertised by the server; fall back to UTF-8
            # when none is declared.
            charset = response.headers.get_content_charset() or 'utf-8'
            # TimeoutError can also surface from read() after a successful
            # connect, and a bad charset declaration can fail the decode, so
            # catch both alongside URLError below.
            return response.read().decode(charset)
    except (URLError, TimeoutError, UnicodeDecodeError) as e:
        # Download failure is non-fatal: callers simply skip this spec.
        print(f"Error downloading {url}: {e}")
        return None
def extract_metadata_header(soup: BeautifulSoup, url: str) -> str:
    """Builds the metadata preamble: source URL plus the cleaned doc title."""
    parts = [f"Source URL: {url}\n"]
    title_div = soup.find('div', class_='titlepage')
    if title_div:
        title_element = title_div.find(class_='title')
        source = title_element if title_element else title_div
        raw_title = source.get_text(separator=' ', strip=True)
        # Strip internal SIG codenames/revisions such as 'vAtlanta r00'.
        cleaned = re.sub(r'\bv[A-Za-z]+\s+r\d{2,}\b', '', raw_title)
        # Collapse any doubled spaces left behind by the removal above.
        cleaned = re.sub(r'\s{2,}', ' ', cleaned).strip()
        parts.append(cleaned + "\n")
        # Underline the title (at least 10 characters wide) for readability.
        parts.append("=" * max(10, len(cleaned)) + "\n\n")
    return "".join(parts)
def stitch_headers(content_root: Tag) -> None:
    """Combines section numbers and titles into single Markdown headers."""
    heading_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
    for heading in content_root.find_all(heading_tags):
        number_span = heading.find("span", class_="formal-number")
        title_span = heading.find("span", class_="formal-title")
        # Only rewrite headings that carry both a number and a title span.
        if not (number_span and title_span):
            continue
        # Derive the Markdown depth from the tag name, e.g. 'h3' -> '###'.
        depth = int(heading.name[1])
        num_text = number_span.get_text().strip()
        title_text = title_span.get_text().strip()
        heading.clear()
        heading.append(f"{'#' * depth} {num_text}. {title_text}")
def convert_table_to_markdown(table: "Tag") -> str:
    """Converts a BeautifulSoup table element into a Markdown table string.

    Args:
        table: The <table> element to convert.

    Returns:
        A Markdown pipe table surrounded by blank lines, or "" when the
        table contains no cells.
    """
    raw_rows = []
    max_cols = 0
    for tr in table.find_all('tr'):
        cells = []
        for cell in tr.find_all(['th', 'td']):
            # Clean up cell text: remove extra newlines/spaces and escape
            # pipes so cell content cannot break the table syntax.
            text = (
                cell.get_text(separator=' ', strip=True)
                .replace('|', '\\|')
                .replace('\n', ' ')
            )
            cells.append(text)
        max_cols = max(max_cols, len(cells))
        if cells:
            raw_rows.append(cells)
    if not raw_rows:
        return ""
    # Pad short rows to max_cols cells; ragged rows (common with rowspan/
    # colspan in the spec HTML) would otherwise render as a broken table.
    rows = [
        f"| {' | '.join(cells + [''] * (max_cols - len(cells)))} |"
        for cells in raw_rows
    ]
    # Insert the '---' separator after the header row if there is one;
    # otherwise append it so the output is still a valid Markdown table.
    separator = f"|{'---|' * max_cols}"
    if len(rows) > 1:
        rows.insert(1, separator)
    else:
        rows.append(separator)
    return '\n\n' + '\n'.join(rows) + '\n\n'
def replace_tables_with_markdown(
    content_root: Tag, soup: BeautifulSoup
) -> None:
    """Finds all tables in the HTML and replaces them with Markdown text."""
    for table_tag in content_root.find_all('table'):
        markdown = convert_table_to_markdown(table_tag)
        # new_string() is required so the replacement participates in the
        # soup's document tree rather than being a bare Python str.
        table_tag.replace_with(soup.new_string(markdown))
def extract_clean_text(meta_header: str, content_root: Tag) -> str:
    """Extracts text content and cleans up excessive newlines."""
    body = content_root.get_text(separator='\n', strip=True)
    # Drop lines that consist solely of an internal SIG codename/revision
    # such as 'vAtlanta r00'.
    body = re.sub(r'(?im)^v[a-z]+\s+r\d{2,}\n?$', '', body)
    # Collapse runs of three or more newlines down to a single blank line.
    return re.sub(r'\n{3,}', '\n\n', meta_header + body)
def process_spec(url: str) -> None:
    """
    Coordinates the process of downloading, parsing, and saving a specification.
    """
    # Deep HTML trees can exceed the default recursion limit while parsing
    # or prettifying; raise it before touching BeautifulSoup.
    sys.setrecursionlimit(10000)
    # Derive the cache filenames from the final URL path segment.
    page_name = os.path.basename(urlparse(url).path).replace('.html', '')
    raw_path = os.path.join(SPEC_DIR, f"{page_name}.html")
    pretty_path = os.path.join(SPEC_DIR, f"{page_name}_pretty.html")
    md_path = os.path.join(SPEC_DIR, f"{page_name}.md")
    # Ensure output directory exists
    os.makedirs(SPEC_DIR, exist_ok=True)
    # Skip the network entirely when all three artifacts are cached.
    if all(os.path.exists(p) for p in (md_path, raw_path, pretty_path)):
        print(f"Skipping cached spec: {page_name}")
        return
    html_content = fetch_html(url)
    if not html_content:
        return
    # Save raw HTML
    with open(raw_path, 'w', encoding='utf-8') as f:
        f.write(html_content)
    soup = BeautifulSoup(html_content, 'html.parser')
    # Save prettified HTML
    with open(pretty_path, 'w', encoding='utf-8') as f:
        f.write(soup.prettify())
    meta_header = extract_metadata_header(soup, url)
    # Prefer the main <article class="topic"> when present; otherwise fall
    # back to converting the entire document.
    main_article = soup.find('article', class_='topic')
    content_root = main_article if main_article else soup
    stitch_headers(content_root)
    replace_tables_with_markdown(content_root, soup)
    text_content = extract_clean_text(meta_header, content_root)
    # Save Markdown
    with open(md_path, 'w', encoding='utf-8') as f:
        f.write(text_content)
    print(f"Generated enhanced Markdown: {md_path}")
def generate_index() -> None:
    """Scans all .md files and generates a global index.md.

    Each index row records a section number, its title, the owning
    specification, and the file:line location of the heading within the
    generated Markdown file.
    """
    print("\nGenerating Section Index...")
    index_entries = []
    # Matches the Markdown headings produced by stitch_headers(), e.g.
    # '## 3.1.2. Some Title' -> group 1: hashes, 2: '3.1.2.', 4: title.
    section_pattern = re.compile(r'^(#+)\s+(\d+(\.\d+)*\.)\s+(.+)$')
    for filename in sorted(os.listdir(SPEC_DIR)):
        if filename.endswith(".md") and filename != "index.md":
            filepath = os.path.join(SPEC_DIR, filename)
            spec_name = ""
            with open(filepath, 'r', encoding='utf-8') as f:
                for line_num, line in enumerate(f, 1):
                    stripped = line.strip()
                    # The second line of each generated file is the spec
                    # title (line 1 is the source URL).
                    if line_num == 2 and not spec_name:
                        spec_name = stripped
                    match = section_pattern.match(stripped)
                    if match:
                        section_num = match.group(2)
                        title = match.group(4)
                        # Bug fix: reference the actual source file instead
                        # of the literal placeholder "(unknown)", so the
                        # File:Line column is usable.
                        index_entries.append(
                            f"| {section_num} | {title} | {spec_name} | "
                            f"{filename}:{line_num} |"
                        )
    index_path = os.path.join(SPEC_DIR, "index.md")
    with open(index_path, 'w', encoding='utf-8') as f:
        f.write("# Bluetooth Core Specification Index\n\n")
        f.write("| Section | Title | Specification | File:Line |\n")
        f.write("|---|---|---|---|\n")
        # Remove duplicates while keeping order
        seen = set()
        for entry in index_entries:
            if entry not in seen:
                f.write(entry + "\n")
                seen.add(entry)
    print(f"Index created: {index_path}")
if __name__ == '__main__':
    # Download and convert each core spec (already-cached specs are
    # skipped), then rebuild the cross-document section index.
    for spec_url in CORE_SPEC_URLS:
        process_spec(spec_url)
    generate_index()
    print("\nAll core specifications synchronized and indexed successfully.")